blog
blog copied to clipboard
socket 源码阅读
Unix/Linux 的哲学思想是“一切皆文件”。所以 socket 也是“文件”。
0x01 socket 文件系统
sock_fs_type 结构定义了名为 sockfs 的文件系统,是一个虚拟文件系统。
/*
 * Filesystem type descriptor for the socket filesystem. Its name is
 * "sockfs"; sockfs_init_fs_context is installed as the init_fs_context
 * callback and kill_anon_super as the kill_sb callback.
 */
static struct file_system_type sock_fs_type = {
.name = "sockfs",
.init_fs_context = sockfs_init_fs_context,
.kill_sb = kill_anon_super,
};
在内核初始化的时候,socket 文件系统会被初始化。
// net/socket.c
/*
 * Boot-time initialization of the socket layer, hooked in via
 * core_initcall(). The visible part sets up the network sysctl
 * infrastructure, the skbuff cache and the inode cache, then registers
 * and kernel-mounts sock_fs_type. Each failing step jumps to an error
 * label; the labels and the rest of the function are elided in this excerpt.
 */
static int __init sock_init(void)
{
int err;
// Initialize the network sysctl infrastructure.
err = net_sysctl_init();
if (err)
goto out;
// Initialize skbuff SLAB cache
skb_init();
// Initialize the protocols module.
init_inodecache();
// Register the socket filesystem type.
err = register_filesystem(&sock_fs_type);
if (err)
goto out_fs;
// Mount the socket filesystem inside the kernel; the mount is kept in sock_mnt.
sock_mnt = kern_mount(&sock_fs_type);
if (IS_ERR(sock_mnt)) {
// sock_mnt holds an encoded error here; PTR_ERR() extracts the error code.
err = PTR_ERR
(sock_mnt);
goto out_mount;
}
...
}
core_initcall(sock_init); /* early initcall */
- register_filesystem 将 socket 文件系统注册到 Linux 内核中。
- kern_mount 将 socket 文件系统挂载到内核中,挂载之后文件系统才能被使用。
0x02 socket 结构
通用的 BSD socket 结构体定义:
// include/linux/net.h
/**
* struct socket - general BSD socket
* @state: socket state (%SS_CONNECTED, etc)
* @type: socket type (%SOCK_STREAM, etc)
* @flags: socket flags (%SOCK_NOSPACE, etc)
* @ops: protocol specific socket operations
* @file: File back pointer for gc
* @sk: internal networking protocol agnostic socket representation
* @wq: wait queue for several uses
*/
struct socket {
socket_state state; /* connection state, one of the SS_* values */
short type; /* socket type, e.g. SOCK_STREAM */
unsigned long flags; /* socket flags, e.g. SOCK_NOSPACE */
struct file *file; /* back pointer to the associated struct file */
struct sock *sk; /* protocol-agnostic networking representation */
const struct proto_ops *ops; /* protocol-specific socket operations */
struct socket_wq wq; /* wait queue */
};
state 表示 socket 的状态,默认是 SS_FREE 表示未分配 socket。
// include/uapi/linux/net.h
/*
 * Connection states of a socket. This is the type of the state field of
 * struct socket; SS_FREE is the zero value.
 */
typedef enum {
SS_FREE = 0, /* not allocated */
SS_UNCONNECTED, /* unconnected to any socket */
SS_CONNECTING, /* in process of connecting */
SS_CONNECTED, /* connected to socket */
SS_DISCONNECTING /* in process of disconnecting */
} socket_state;
我们常见的 socket 类型一般是流( SOCK_STREAM )和报文( SOCK_DGRAM )。类型定义有下列几种。
// include/linux/net.h
/**
* enum sock_type - Socket types
* @SOCK_STREAM: stream (connection) socket
* @SOCK_DGRAM: datagram (conn.less) socket
* @SOCK_RAW: raw socket
* @SOCK_RDM: reliably-delivered message
* @SOCK_SEQPACKET: sequential packet socket
* @SOCK_DCCP: Datagram Congestion Control Protocol socket
* @SOCK_PACKET: linux specific way of getting packets at the dev level.
* For writing rarp and other similar things on the user level.
*
* When adding some new socket type please
* grep ARCH_HAS_SOCKET_TYPE include/asm-* /socket.h, at least MIPS
* overrides this enum for binary compat reasons.
*/
enum sock_type {
SOCK_STREAM = 1, /* stream (connection) socket */
SOCK_DGRAM = 2, /* datagram (connectionless) socket */
SOCK_RAW = 3, /* raw socket */
SOCK_RDM = 4, /* reliably-delivered message */
SOCK_SEQPACKET = 5, /* sequential packet socket */
SOCK_DCCP = 6, /* Datagram Congestion Control Protocol socket */
SOCK_PACKET = 10, /* device-level packet access; note the gap after 6 */
};
socket 的 sock 结构指针用来指向与网络协议相关的内容。
// include/net/sock.h
/*
 * Network-layer representation of a socket; struct socket refers to one
 * through its sk pointer. The structure starts with an embedded
 * struct sock_common, and only part of the definition is quoted here.
 */
struct sock {
/*
* Now struct inet_timewait_sock also uses sock_common, so please just
* don't add nothing before this first member (__sk_common) --acme
*/
struct sock_common __sk_common;
/*
 * Shorthand names: every sk_* macro below expands to the matching skc_*
 * member of the embedded __sk_common.
 */
#define sk_node __sk_common.skc_node
#define sk_nulls_node __sk_common.skc_nulls_node
#define sk_refcnt __sk_common.skc_refcnt
#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
/* sk_rx_queue_mapping is only defined when CONFIG_XPS is set. */
#ifdef CONFIG_XPS
#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping
#endif
#define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end __sk_common.skc_dontcopy_end
#define sk_hash __sk_common.skc_hash
#define sk_portpair __sk_common.skc_portpair
#define sk_num __sk_common.skc_num
#define sk_dport __sk_common.skc_dport
#define sk_addrpair __sk_common.skc_addrpair
#define sk_daddr __sk_common.skc_daddr
#define sk_rcv_saddr __sk_common.skc_rcv_saddr
#define sk_family __sk_common.skc_family
#define sk_state __sk_common.skc_state
#define sk_reuse __sk_common.skc_reuse
#define sk_reuseport __sk_common.skc_reuseport
#define sk_ipv6only __sk_common.skc_ipv6only
#define sk_net_refcnt __sk_common.skc_net_refcnt
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
#define sk_net __sk_common.skc_net
#define sk_v6_daddr __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
#define sk_cookie __sk_common.skc_cookie
#define sk_incoming_cpu __sk_common.skc_incoming_cpu
#define sk_flags __sk_common.skc_flags
#define sk_rxhash __sk_common.skc_rxhash
socket_lock_t sk_lock;
atomic_t sk_drops;
int sk_rcvlowat;
struct sk_buff_head sk_error_queue;
struct sk_buff *sk_rx_skb_cache;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
* access. Therefore we special case it's implementation.
* Note : rmem_alloc is in this structure to fill a hole
* on 64bit arches, not because its logically part of
* backlog.
*/
struct {
atomic_t rmem_alloc;
int len;
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog;
...
};
sock 作为网络协议的通用结构,sock_common 表示最小的网络层协议结构,具体的网络协议族内容是基于 sock 结构扩展的。
socket 支持多种网络协议族(family),常见的有 IPv4(PF_INET)、IPv6(PF_INET6)和 Unix(PF_UNIX)。如 IPv4 对应的 inet_sock 结构:
// include/net/inet_sock.h
/** struct inet_sock - representation of INET sockets
*
* @sk - ancestor class
* @pinet6 - pointer to IPv6 control block
* @inet_daddr - Foreign IPv4 addr
* @inet_rcv_saddr - Bound local IPv4 addr
* @inet_dport - Destination port
* @inet_num - Local port
* @inet_saddr - Sending source
* @uc_ttl - Unicast TTL
* @inet_sport - Source port
* @inet_id - ID counter for DF pkts
* @tos - TOS
* @mc_ttl - Multicasting TTL
* @is_icsk - is this an inet_connection_sock?
* @uc_index - Unicast outgoing device index
* @mc_index - Multicast device index
* @mc_list - Group array
* @cork - info to build ip hdr on each ip frag while socket is corked
*/
/*
 * INET socket: extends struct sock by embedding it as the first member.
 * Only the beginning of the definition is quoted here.
 */
struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk;
/* The IPv6 control block pointer exists only when CONFIG_IPV6 is enabled. */
#if IS_ENABLED(CONFIG_IPV6)
struct ipv6_pinfo *pinet6;
#endif
/* Socket demultiplex comparisons on incoming packets. */
/* Each inet_* macro below aliases a field of sk.__sk_common. */
#define inet_daddr sk.__sk_common.skc_daddr
#define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
#define inet_dport sk.__sk_common.skc_dport
#define inet_num sk.__sk_common.skc_num
...
};
0x03 协议族
socket 支持的多种协议族:
// include/linux/socket.h
/* Supported address families. */
/* Only the first entries are quoted; the list continues past AF_INET6. */
#define AF_UNSPEC 0
#define AF_UNIX 1 /* Unix domain sockets */
#define AF_LOCAL 1 /* POSIX name for AF_UNIX */
#define AF_INET 2 /* Internet IP Protocol */
#define AF_AX25 3 /* Amateur Radio AX.25 */
#define AF_IPX 4 /* Novell IPX */
#define AF_APPLETALK 5 /* AppleTalk DDP */
#define AF_NETROM 6 /* Amateur Radio NET/ROM */
#define AF_BRIDGE 7 /* Multiprotocol bridge */
#define AF_ATMPVC 8 /* ATM PVCs */
#define AF_X25 9 /* Reserved for X.25 project */
#define AF_INET6 10 /* IP version 6 */
...
/* Protocol families, same as address families. */
/* Each PF_* macro expands to the AF_* macro with the same suffix. */
#define PF_UNSPEC AF_UNSPEC
#define PF_UNIX AF_UNIX
#define PF_LOCAL AF_LOCAL
#define PF_INET AF_INET
...
PF_INET(protocol family)等价于 AF_INET(address family)。
内核中是这样定义一个协议族的:
// include/linux/net.h
/*
 * Describes one protocol family to the socket core. sock_register()
 * stores a pointer to it in net_families[] at index family.
 */
struct net_proto_family {
int family; /* family number, e.g. PF_INET; sock_register() rejects values >= NPROTO */
/* Socket creation callback for this family (inet_create for PF_INET). */
int (*create)(struct net *net, struct socket *sock,
int protocol, int kern);
struct module *owner; /* owning module, e.g. THIS_MODULE */
};
// net/ipv4/af_inet.c
/*
 * Protocol family descriptor for PF_INET: sockets of this family are
 * created through inet_create. inet_init() passes it to sock_register().
 */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
inet_family_ops 就是 PF_INET 的协议族结构,上面提到的 inet_sock 就是由该结构中的创建函数 inet_create 创建的。
INET 协议族支持三种类型的套接字:流套接字(SOCK_STREAM),数据报套接字(SOCK_DGRAM),原始套接字(SOCK_RAW)。 流套接字支持传输层的 TCP 协议,数据报套接字支持传输层的 UDP、网络层 ICMP 协议(ping 套接字),原始套接字则支持网络层的附属协议 ICMP、IGMP 等。
// net/ipv4/af_inet.c
/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
*/
/*
 * Built-in (type, protocol) combinations of the INET family. Each entry
 * pairs a struct proto (prot) with a struct proto_ops (ops).
 */
static struct inet_protosw inetsw_array[] =
{
/* SOCK_STREAM + IPPROTO_TCP; flagged permanent. */
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
/* SOCK_DGRAM + IPPROTO_UDP; flagged permanent. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},
/* SOCK_DGRAM + IPPROTO_ICMP, served by ping_prot; not permanent. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
},
/* SOCK_RAW with the wildcard protocol IPPROTO_IP; not permanent. */
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};
inet_protosw 结构是套接口层和协议层之间的桥梁,sw 表示 switch。
// include/net/protocol.h
/* This is used to register socket interfaces for IP protocols. */
struct inet_protosw {
struct list_head list; /* links the entry into inetsw[type] */
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2). */
unsigned short protocol; /* This is the L4 protocol number. */
struct proto *prot; /* operations taking a struct sock, e.g. &tcp_prot */
const struct proto_ops *ops; /* operations taking a struct socket, e.g. &inet_stream_ops */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
type 表示 socket 类型,protocol 表示传输层(严格上说不完全是传输层)协议号。
proto 和 proto_ops 结构分别对应了协议层接口操作函数集和套接口层(协议族类型)对应的操作函数集。
// include/net/sock.h
/* Networking protocol blocks we attach to sockets.
* socket layer -> transport layer interface
*/
/*
 * Callback table of a protocol. Every callback quoted here operates on a
 * struct sock; the remaining members are elided in this excerpt.
 */
struct proto {
void (*close)(struct sock *sk, long timeout);
int (*pre_connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int (*disconnect)(struct sock *sk, int flags);
/* Returns a struct sock pointer; err is an output parameter. */
struct sock * (*accept)(struct sock *sk, int flags, int *err, bool kern);
...
int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len);
int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len);
...
}
// include/linux/net.h
/*
 * Callback table of the socket layer. Unlike struct proto, every callback
 * quoted here operates on a struct socket; the remaining members are
 * elided in this excerpt.
 */
struct proto_ops {
int family; /* protocol family these operations belong to */
struct module *owner;
int (*release) (struct socket *sock);
int (*bind) (struct socket *sock, struct sockaddr *myaddr, int sockaddr_len);
int (*connect) (struct socket *sock, struct sockaddr *vaddr, int sockaddr_len, int flags);
int (*socketpair)(struct socket *sock1, struct socket *sock2);
int (*accept) (struct socket *sock, struct socket *newsock, int flags, bool kern);
...
int (*sendmsg) (struct socket *sock, struct msghdr *m, size_t total_len);
int (*recvmsg) (struct socket *sock, struct msghdr *m, size_t total_len, int flags);
...
}
我们来看一下 INET 协议族的初始化过程。
// net/ipv4/af_inet.c
/*
 * Initialization of the INET protocol family, hooked in via fs_initcall().
 * The visible part registers the TCP, UDP, raw and ping protos, registers
 * the family itself, adds the base protocol handlers and fills inetsw[]
 * from inetsw_array[]. Error labels and other steps are elided here.
 */
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc = -EINVAL;
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
/*
 * Register the four protos, passing 1 as alloc_slab. A failure jumps to
 * a label named after the previously registered proto.
 */
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
/* Register the PF_INET family; the return value is deliberately discarded. */
(void)sock_register(&inet_family_ops);
...
/*
* Add all the base protocols.
*/
/* A failure below is only logged with pr_crit(); execution continues. */
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
/* IGMP is only added when CONFIG_IP_MULTICAST is defined. */
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
/* First initialize every list head in inetsw[], ... */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
/* ... then register each inetsw_array[] entry. */
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
...
}
fs_initcall(inet_init);
proto_register
// net/core/sock.c
/* proto_list is modified below while proto_list_mutex is held. */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
/*
 * Adds prot to proto_list. When alloc_slab is non-zero, a slab cache
 * named prot->name with object size prot->obj_size is created first and
 * stored in prot->slab. Parts of the function, including the error
 * labels, are elided in this excerpt.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
int ret = -ENOBUFS;
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
prot->slab_flags,
prot->useroffset, prot->usersize,
NULL);
...
}
mutex_lock(&proto_list_mutex);
ret = assign_proto_idx(prot);
if (ret) {
/* Drop the mutex before leaving through the cleanup label. */
mutex_unlock(&proto_list_mutex);
goto out_free_timewait_sock_slab_name;
}
list_add(&prot->node, &proto_list);
mutex_unlock(&proto_list_mutex);
/* ret is 0 here: a non-zero value took the goto above. */
return ret;
...
}
首先 proto_register 向内核 proto_list 双向链表中注册了几个协议 tcp_prot、udp_prot、raw_prot、ping_prot。通过查看 /proc/net/protocols 文件即可知道内核注册的网络协议信息。
sock_register
// net/socket.c
/*
* The protocol list. Each protocol is registered in here.
*/
static DEFINE_SPINLOCK(net_family_lock);
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
/**
* sock_register - add a socket protocol handler
* @ops: description of protocol
*
* This function is called by a protocol handler that wants to
* advertise its address family, and have it linked into the
* socket interface. The value ops->family corresponds to the
* socket system call protocol family.
*/
/*
 * Returns 0 on success, -ENOBUFS when ops->family is >= NPROTO, and
 * -EEXIST when the net_families[] slot for that family is already taken.
 */
int sock_register(const struct net_proto_family *ops)
{
int err;
/* Only the upper bound of ops->family is checked here. */
if (ops->family >= NPROTO) {
pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
return -ENOBUFS;
}
/* The slot is examined and written while net_family_lock is held. */
spin_lock(&net_family_lock);
if (rcu_dereference_protected(net_families[ops->family],
lockdep_is_held(&net_family_lock)))
err = -EEXIST;
else {
rcu_assign_pointer(net_families[ops->family], ops);
err = 0;
}
spin_unlock(&net_family_lock);
/* NOTE: this message is printed on both paths, including -EEXIST. */
pr_info("NET: Registered protocol family %d\n", ops->family);
return err;
}
sock_register 向 net_families 协议族数组注册 INET 协议族。
inet_add_protocol
// net/ipv4/protocol.c
/* Table of protocol handlers; inet_add_protocol() indexes it by protocol number. */
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
/*
 * Installs prot in inet_protos[protocol]. Returns -EINVAL when
 * prot->netns_ok is not set, 0 when the handler was installed, and -1
 * when the slot was already occupied.
 */
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
if (!prot->netns_ok) {
pr_err("Protocol %u is not namespace aware, cannot register.\n",
protocol);
return -EINVAL;
}
/* cmpxchg() stores prot only if the slot was NULL and yields the old value. */
return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}
inet_add_protocol 函数在 inet_protos 数组里为 INET 协议族注册对应的协议处理结构体。
// include/net/protocol.h
/* This is used to register protocols. */
/*
 * Per-protocol handler set stored in inet_protos[]. All four callbacks
 * take the sk_buff being processed.
 */
struct net_protocol {
int (*early_demux)(struct sk_buff *skb);
int (*early_demux_handler)(struct sk_buff *skb);
/* Receive handler, e.g. tcp_v4_rcv for TCP. */
int (*handler)(struct sk_buff *skb);
/* This returns an error if we weren't able to handle the error. */
int (*err_handler)(struct sk_buff *skb, u32 info);
/* One-bit flags; inet_add_protocol() refuses entries with netns_ok clear. */
unsigned int no_policy:1,
netns_ok:1,
/* does the protocol do more stringent
* icmp tag validation than simple
* socket lookup?
*/
icmp_strict_tag_validation:1;
};
// net/ipv4/af_inet.c
/*
 * Handler set for TCP; inet_init() adds it under IPPROTO_TCP. Both
 * early_demux and early_demux_handler point at tcp_v4_early_demux, and
 * all three flag bits are set.
 */
static struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.early_demux_handler = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.no_policy = 1,
.netns_ok = 1,
.icmp_strict_tag_validation = 1,
};
net_protocol 结构定义了协议族中支持的网络协议的接收函数。当从网络层接收到 IP 数据包时,会在 inet_protos 中根据协议号找到对应的协议处理结构体,如处理 tcp 协议的 tcp_protocol,调用 handler 函数来接收数据。
inetsw & inetsw_array
/* Register the socket-side information for inet_create. */
/* First loop: initialize each of the SOCK_MAX list heads in inetsw[]. */
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
/* Second loop: pass every inetsw_array[] entry to inet_register_protosw(). */
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
这两个循环分别是初始化 inetsw 链表数组,和将 inetsw_array 中定义的结构按照套接字类型作为索引注册在 inetsw 上。
// net/ipv4/af_inet.c
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
* It holds one list head per socket type (SOCK_MAX entries);
* inet_register_protosw() indexes it with p->type.
*/
static struct list_head inetsw[SOCK_MAX];
/*
 * Inserts p into the list inetsw[p->type]. Entries with p->type >=
 * SOCK_MAX are rejected, as is an entry whose protocol matches an
 * existing permanent entry. The error labels and the unlock path are
 * elided in this excerpt.
 */
void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;
spin_lock_bh(&inetsw_lock);
if (p->type >= SOCK_MAX)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
/* last_perm starts at the list head and advances past permanent entries. */
last_perm = &inetsw[p->type];
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
/* Stop scanning at the first entry that is not permanent. */
if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
break;
if (protocol == answer->protocol)
goto out_permanent;
last_perm = lh;
}
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
list_add_rcu(&p->list, last_perm);
...
}
list_head 双向链表
// tools/include/linux/types.h
/*
 * Node of a doubly linked list: it holds only the forward and backward
 * links. It is embedded in other structures, e.g. as the list member of
 * struct inet_protosw.
 */
struct list_head {
struct list_head *next, *prev;
};
// tools/include/linux/list.h
/* Initializer that makes both links of name point back at name itself. */
#define LIST_HEAD_INIT(name) { &(name), &(name) }
/* Defines a struct list_head variable called name, initialized with LIST_HEAD_INIT. */
#define LIST_HEAD(name) \
struct list_head name = LIST_HEAD_INIT(name)
/*
 * Run-time counterpart of LIST_HEAD_INIT: points both next and prev of
 * list at list itself.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
list->next = list;
list->prev = list;
}
/**
* list_for_each - iterate over a list
* @pos: the &struct list_head to use as a loop cursor.
* @head: the head for your list.
*
* Expands to a for loop that starts at head->next and ends when the
* cursor is back at @head, so @head itself is never visited.
*/
#define list_for_each(pos, head) \
for (pos = (head)->next; pos != (head); pos = pos->next)
/**
* list_entry - get the struct for this entry
* @ptr: the &struct list_head pointer.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_head within the struct.
*
* Thin wrapper: expands directly to container_of(ptr, type, member).
*/
#define list_entry(ptr, type, member) \
container_of(ptr, type, member)