blog
blog copied to clipboard
socket 源码阅读 - socket
系统调用 socket(family, type, protocol) 创建一个新 socket,并返回其文件描述符。
// net/socket.c
int __sys_socket(int family, int type, int protocol)
{
int retval;
struct socket *sock;
int flags;
flags = type & ~SOCK_TYPE_MASK;
if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
type &= SOCK_TYPE_MASK;
if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
retval = sock_create(family, type, protocol, &sock);
if (retval < 0)
return retval;
return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
}
__sys_socket 中主要有两个函数调用。
- sock_create 函数创建一个新的 socket 指针。
- sock_map_fd 函数为新建的 socket 指针打开一个文件描述符。
sock_create
// net/socket.c
/**
* sock_create - creates a socket
* @family: protocol family (AF_INET, ...)
* @type: communication type (SOCK_STREAM, ...)
* @protocol: protocol (0, ...)
* @res: new socket
*
* A wrapper around __sock_create().
* Returns 0 or an error. This function internally uses GFP_KERNEL.
*/
int sock_create(int family, int type, int protocol, struct socket **res)
{
return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
/**
* __sock_create - creates a socket
* @net: net namespace
* @family: protocol family (AF_INET, ...)
* @type: communication type (SOCK_STREAM, ...)
* @protocol: protocol (0, ...)
* @res: new socket
* @kern: boolean for kernel space sockets
*
* Creates a new socket and assigns it to @res, passing through LSM.
* Returns 0 or an error. On failure @res is set to %NULL. @kern must
* be set to true if the socket resides in kernel space.
* This function internally uses GFP_KERNEL.
*/
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
// Check protocol is in range
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
...
// 分配 socket 空间
// 创建 socket.sock
分配 socket 空间
// net/socket.c - __sock_create
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
sock->type = type;
sock_alloc() 申请分配 socket 结构空间。
// net/socket.c
/**
* sock_alloc - allocate a socket
*
* Allocate a new inode and socket object. The two are bound together
* and initialised. The socket is then returned. If we are out of inodes
* NULL is returned. This functions uses GFP_KERNEL internally.
*/
struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
inode = new_inode_pseudo(sock_mnt->mnt_sb);
if (!inode)
return NULL;
sock = SOCKET_I(inode);
inode->i_ino = get_next_ino();
inode->i_mode = S_IFSOCK | S_IRWXUGO;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_op = &sockfs_inode_ops;
return sock;
}
sock_mnt 是 socket 网络文件系统的根结点,new_inode_pseudo 函数创建一个新的 inode 节点。
new_inode_pseudo 会通过 sock_mnt->mnt_sb 指向 socket 网络文件系统的超级块来创建一个挂载到 socket 网络文件系统上的 inode。
// fs/inode.c
struct inode *new_inode_pseudo(struct super_block *sb)
{
struct inode *inode = alloc_inode(sb);
...
}
static struct inode *alloc_inode(struct super_block *sb)
{
const struct super_operations *ops = sb->s_op;
struct inode *inode;
if (ops->alloc_inode)
inode = ops->alloc_inode(sb);
else
inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
...
}
sock_mnt->mnt_sb->op->alloc_inode 指向 sock_alloc_inode 函数。
// net/socket.c
static struct inode *sock_alloc_inode(struct super_block *sb)
{
struct socket_alloc *ei;
ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
init_waitqueue_head(&ei->socket.wq.wait);
ei->socket.wq.fasync_list = NULL;
ei->socket.wq.flags = 0;
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
ei->socket.ops = NULL;
ei->socket.sk = NULL;
ei->socket.file = NULL;
return &ei->vfs_inode;
}
sock_alloc_inode 函数在 slab 高速缓存的 inode 缓存区直接分配 socket_alloc 结构。
// include/net/sock.h
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};
socket_alloc 结构体中包含了 socket 和 inode 结构,这两块内存都会被分配,sock_alloc_inode 函数还顺带初始化了 socket 结构,返回 socket_alloc 中的 vfs_inode 结构地址。
所以,后面可以通过内联函数 SOCKET_I 从 inode 指针取得 socket 结构地址。
// include/net/sock.h
static inline struct socket *SOCKET_I(struct inode *inode)
{
return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}
// include/linux/kernel.h
#define container_of(ptr, type, member) ({ \
void *__mptr = (void *)(ptr); \
((type *)(__mptr - offsetof(type, member))); })
// tools/include/linux/kernel.h
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
创建 socket.sock
上一步创建并初始化 socket 结构,下面开始创建特定网络协议的 sock 结构。
// net/socket.c - __sock_create
...
rcu_read_lock();
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
rcu_read_unlock();
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
pf 表示对应的 family 网络协议族结构,从 net_families 中获得。
对于 INET 协议族, pf->create 指向 inet_create 函数。
// net/socket.c
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
sock_map_fd
sock_map_fd 将为 socket 打开一个文件,并返回文件描述符 fd。
// net/socket.c
static int sock_map_fd(struct socket *sock, int flags)
{
struct file *newfile;
int fd = get_unused_fd_flags(flags);
if (unlikely(fd < 0)) {
sock_release(sock);
return fd;
}
newfile = sock_alloc_file(sock, flags, NULL);
if (!IS_ERR(newfile)) {
fd_install(fd, newfile);
return fd;
}
put_unused_fd(fd);
return PTR_ERR(newfile);
}
- get_unused_fd_flags 申请一个文件描述符。
- sock_alloc_file 为 socket 申请一个文件指针。
- fd_install 把 fd 和文件指针建立了关联。
// net/socket.c
/**
* sock_alloc_file - Bind a &socket to a &file
* @sock: socket
* @flags: file status flags
* @dname: protocol name
*
* Returns the &file bound with @sock, implicitly storing it
* in sock->file. If dname is %NULL, sets to "".
* On failure the return is a ERR pointer (see linux/err.h).
* This function uses GFP_KERNEL internally.
*/
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
struct file *file;
if (!dname)
dname = sock->sk ? sock->sk->sk_prot_creator->name : "";
file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
O_RDWR | (flags & O_NONBLOCK),
&socket_file_ops);
if (IS_ERR(file)) {
sock_release(sock);
return file;
}
sock->file = file;
file->private_data = sock;
stream_open(SOCK_INODE(sock), file);
return file;
}
SOCK_INODE(sock) 是 SOCKET_I 的逆向操作,返回 socket 指针的 inode 结构指针。
alloc_file_pseudo 函数为 inode 申请一个文件指针,并指定该 socket 文件的操作函数。
// net/socket.c
/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
*/
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.read_iter = sock_read_iter,
.write_iter = sock_write_iter,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
.show_fdinfo = sock_show_fdinfo,
};
之后互相关联 socket 指针和文件指针,并为这个文件指针打开一个流。
inet_create
// net/ipv4/af_inet.c
/*
* Create an inet socket.
*/
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
int try_loading_module = 0;
int err;
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
// socket 的状态置为未连接。
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
// 在 inetsw 对应类型的链表中查找符合的 inet_protosw 结构体
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
if (IPPROTO_IP == protocol) {
protocol = answer->protocol;
break;
}
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
}
...
// 设置 socket 的套接口层处理函数集
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_flags = answer->flags;
rcu_read_unlock();
WARN_ON(!answer_prot->slab);
err = -ENOBUFS;
// 分配 inet_sock 结构
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
err = 0;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
// 初始化 sock
...
}
inet sock 实际上由 sk_alloc 来分配。
// net/core/sock.c
/**
* sk_alloc - All socket objects are allocated here
* @net: the applicable net namespace
* @family: protocol family
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
* @prot: struct proto associated with this new sock instance
* @kern: is this to be a kernel socket?
*/
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot, int kern)
{
struct sock *sk;
sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
if (sk) {
sk->sk_family = family;
/*
* See comment in struct sock definition to understand
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sk->sk_kern_sock = kern;
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt)) {
get_net(net);
sock_inuse_add(net, 1);
}
sock_net_set(sk, net);
refcount_set(&sk->sk_wmem_alloc, 1);
mem_cgroup_sk_alloc(sk);
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
}
return sk;
}
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
int family)
{
struct sock *sk;
struct kmem_cache *slab;
slab = prot->slab;
if (slab != NULL) {
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
if (!sk)
return sk;
if (want_init_on_alloc(priority))
sk_prot_clear_nulls(sk, prot->obj_size);
} else
sk = kmalloc(prot->obj_size, priority);
if (sk != NULL) {
if (security_sk_alloc(sk, family, priority))
goto out_free;
if (!try_module_get(prot->owner))
goto out_free_sec;
sk_tx_queue_clear(sk);
}
return sk;
out_free_sec:
security_sk_free(sk);
out_free:
if (slab != NULL)
kmem_cache_free(slab, sk);
else
kfree(sk);
return NULL;
}