Linux内核中netlink协议族的实现(下)

Linux内核中netlink协议族的实现(上)
本文档的Copyleft归yfydz所有，使用GPL发布，可以自由拷贝，转载，转载时请保持文档的完整性，严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源：http://yfydz.cublog.cn
1. 前言

netlink协议族是Linux内核网络部分的一个固定部分, 一旦在内核配置中选了网络支持就自动带了而不能单独去掉。
netlink的实现源码在net/netlink目录下，主要是net/netlink/af_netlink.c文件。

以下内核代码版本为2.6.19.2, 如无特别说明代码取自net/netlink/af_netlink.c。

2. 数据结构

netlink套接口结构:
/* net/netlink/af_netlink.c */
/*
 * Per-socket state of a netlink socket.  The generic struct sock is embedded
 * as the first member so that a struct sock pointer can be converted directly
 * into a struct netlink_sock pointer.
 */
struct netlink_sock {
 /* struct sock has to be the first member of netlink_sock */
 struct sock  sk;
 u32   pid; // this socket's own pid; assigned in netlink_insert()
 u32   dst_pid; // pid of the peer
 u32   dst_group; // group of the peer
 u32   flags; // NETLINK_KERNEL_SOCKET is set here by netlink_kernel_create()
 u32   subscriptions; // subscription count, maintained by netlink_update_subscriptions()
 u32   ngroups; // number of multicast groups
 unsigned long  *groups; // multicast group bitmap; word 0 is updated in netlink_bind()
 unsigned long  state;
 wait_queue_head_t wait; // wait queue; set up in __netlink_create(), woken in netlink_release()
 struct netlink_callback *cb;  // callback control block holding the callback functions
 spinlock_t  cb_lock; // held around accesses to cb in netlink_release()
 void   (*data_ready)(struct sock *sk, int bytes); // handler run when data
                                // arrives; each netlink protocol type (ROUTE, FIREWALL, ARPD, ...)
                                // supplies its own through netlink_kernel_create()'s input argument
 struct module  *module; // module reference stored by netlink_create(), dropped in netlink_release()
};
这个结构先是包含一个标准的struct sock结构, 后面再跟上netlink特有的数据, 内核中其他协议的sock也是类似定义的。注意sock结构必须放在第一位, 这样才可以直接将sock的指针转为netlink_sock的指针。
 
netlink sock的表:
/*
 * Per-protocol netlink table entry.  nl_table[] is allocated with MAX_LINKS
 * of these in netlink_proto_init() and is indexed by protocol number.
 */
struct netlink_table {
 struct nl_pid_hash hash; // sockets of this protocol hashed by pid; filled by netlink_insert()
 struct hlist_head mc_list; // sockets holding multicast subscriptions; see netlink_update_subscriptions()
 unsigned long *listeners;  // bitmap of groups that have listeners; rebuilt by netlink_update_listeners()
 unsigned int nl_nonroot;
 unsigned int groups; // number of groups of this protocol; netlink_kernel_create() enforces a minimum of 32
 struct module *module; // module recorded by netlink_kernel_create()
 int registered; // set to 1 by netlink_kernel_create(), cleared in netlink_release()
};
最大可有MAX_LINKS(32)个表，处理不同协议类型的netlink套接口, 注意由于是自身的通信, 本机同时作为服务器和客户端, 服务端需要一个套接口对应, 每个客户端也要有一个套接口对应, 多个客户端的套接口形成一个链表.
/* Pid-keyed hash table holding the sockets of one netlink protocol. */
struct nl_pid_hash {
 struct hlist_head *table; // array of hash bucket heads; allocated in netlink_proto_init()
 unsigned long rehash_time; // rehash timestamp; initialised to jiffies in netlink_proto_init()
 unsigned int mask; // starts at 0 in netlink_proto_init()
 unsigned int shift; // starts at 0 in netlink_proto_init()
 unsigned int entries;  // number of sockets in the table; incremented in netlink_insert()
 unsigned int max_shift; // maximum exponent, computed from the memory size at init time
 u32 rnd; // random value (presumably a hash seed -- TODO confirm in nl_pid_hashfn())
};
其他和netlink数据相关的数据结构在include/linux/netlink.h中定义, 不过这些结构更多用在各具体的netlink对象的实现中, 在基本netlink套接口中倒是用得不多。

3. af_netlink协议初始化

/*
 * Initialise the netlink protocol family: register netlink_proto, allocate
 * nl_table with one pid hash per protocol, register the socket family,
 * create the /proc entry (when CONFIG_PROC_FS is set) and initialise
 * rtnetlink.
 *
 * Returns 0 on success or the error from proto_register().  A failed
 * allocation of nl_table or of a hash table ends in panic().
 */
static int __init netlink_proto_init(void)
{
 struct sk_buff *dummy_skb;
 int i;
 unsigned long max;
 unsigned int order;
// Register netlink_proto, which is defined as follows:
// static struct proto netlink_proto = {
//  .name   = "NETLINK",
//  .owner   = THIS_MODULE,
//  .obj_size = sizeof(struct netlink_sock),
// };
// The second argument is 0, i.e. no dedicated slab cache is requested; the
// structure mainly announces the size of a netlink sock (obj_size).
 int err = proto_register(&netlink_proto, 0);
 if (err != 0)
  goto out;
// Compile-time check that netlink_skb_parms fits into skb->cb; dummy_skb
// exists only to be used inside this sizeof.
 BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb));
// Allocate MAX_LINKS zero-filled netlink table entries
 nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
 if (!nl_table)
  goto panic;
// Derive the maximum hash table size from num_physpages.
// PAGE_SHIFT is log2 of the page size (12 on i386, i.e. 4K pages).
// With num_physpages == 128*1024: max = (128*1024) >> (21-12) = 256
// With num_physpages ==  64*1024: max = (64*1024) >> (23-12) = 32
 if (num_physpages >= (128 * 1024))
  max = num_physpages >> (21 - PAGE_SHIFT);
 else
  max = num_physpages >> (23 - PAGE_SHIFT);
// Turn max into an exponent of a byte size by combining it with PAGE_SHIFT
 order = get_bitmask_order(max) - 1 + PAGE_SHIFT;
// max now becomes the number of hlist_head entries fitting into 1 << order bytes
 max = (1UL << order) / sizeof(struct hlist_head);
// order now becomes the exponent for that entry count (max clamped to UINT_MAX)
 order = get_bitmask_order(max > UINT_MAX ? UINT_MAX : max) - 1;
 for (i = 0; i < MAX_LINKS; i++) {
  struct nl_pid_hash *hash = &nl_table[i].hash;
// Allocate the initial hash table of each protocol: a single bucket head
  hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table));
  if (!hash->table) {
// Allocation failed: release the tables allocated so far, then nl_table itself
   while (i-- > 0)
    nl_pid_hash_free(nl_table[i].hash.table,
       1 * sizeof(*hash->table));
   kfree(nl_table);
   goto panic;
  }
// Initialise the hash table and its parameters
  memset(hash->table, 0, 1 * sizeof(*hash->table));
// Maximum exponent computed above
  hash->max_shift = order;
  hash->shift = 0;
  hash->mask = 0;
  hash->rehash_time = jiffies;
 }
// Register the operations of the netlink protocol family
 sock_register(&netlink_family_ops);
#ifdef CONFIG_PROC_FS
 proc_net_fops_create("netlink", 0, &netlink_seq_fops);
#endif
 /* The netlink device handler may be needed early. */
// Initialise the routing netlink
 rtnetlink_init();
out:
 return err;
panic:
 panic("netlink_init: Cannot allocate nl_table\n");
}
core_initcall(netlink_proto_init);
 

4. 建立netlink套接口

4.1  建立对应客户端的套接口
/*
 * Protocol family descriptor for PF_NETLINK, registered with sock_register()
 * in netlink_proto_init().  Its create hook, netlink_create(), is invoked
 * when user space opens a netlink socket with socket(2); per the article the
 * call is made from __sock_create() in net/socket.c.
 */
static struct net_proto_family netlink_family_ops = {
 .family = PF_NETLINK,
 .owner = THIS_MODULE, /* for consistency 8) */
 .create = netlink_create,
};
/*
 * Create a netlink socket; this is the create hook of netlink_family_ops and
 * runs every time user space opens a netlink socket.
 *
 * @sock:     socket being set up; only SOCK_RAW and SOCK_DGRAM are accepted
 * @protocol: netlink protocol number, must lie in [0, MAX_LINKS)
 *
 * Returns 0 on success, -ESOCKTNOSUPPORT for an unsupported socket type,
 * -EPROTONOSUPPORT for an out-of-range protocol, or the negative error
 * returned by __netlink_create().
 */
static int netlink_create(struct socket *sock, int protocol)
{
 struct module *module = NULL;
 struct netlink_sock *nlk;
 int err = 0;

 /* Initial socket state */
 sock->state = SS_UNCONNECTED;

 /* Validate the socket type and the netlink protocol number */
 if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
  return -ESOCKTNOSUPPORT;
 if (protocol < 0 || protocol >= MAX_LINKS)
  return -EPROTONOSUPPORT;

 netlink_lock_table();
#ifdef CONFIG_KMOD
 /*
  * Protocol not registered yet: try to load the module providing it.
  * The table lock is dropped around request_module().
  */
 if (!nl_table[protocol].registered) {
  netlink_unlock_table();
  request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
  netlink_lock_table();
 }
#endif
 /* Take a reference on the module that registered the protocol, if any */
 if (nl_table[protocol].registered &&
     try_module_get(nl_table[protocol].module))
  module = nl_table[protocol].module;
 netlink_unlock_table();

 /* The actual socket construction is done by __netlink_create() */
 err = __netlink_create(sock, protocol);
 if (err < 0)
  goto out_module;

 /* Remember the module so that netlink_release() can drop the reference */
 nlk = nlk_sk(sock->sk);
 nlk->module = module;
out:
 return err;
out_module:
 /* Creation failed: give the module reference back */
 module_put(module);
 goto out;
}

// 基本函数
static int __netlink_create(struct socket *sock, int protocol)
{
 struct sock *sk;
 struct netlink_sock *nlk;
// netlink sock的基本操作
 sock->ops = &netlink_ops;
// 分配sock结构, 通过netlink_proto中的obj_size指出了netlink sock的大小
 sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1);
 if (!sk)
  return -ENOMEM;
// 初始化sock基本数据, 将sock和socket关联起来
 sock_init_data(sock, sk);
// 将普通sock转为netlink sock,实际只是重新定义的一下指针类型,指针本身值不变
 nlk = nlk_sk(sk);
// 初始化sock的锁
 spin_lock_init(&nlk->cb_lock);
// 初始化等待队列
 init_waitqueue_head(&nlk->wait);
// sock的析构函数,释放接收队列中的skb数据包
 sk->sk_destruct = netlink_sock_destruct;
 sk->sk_protocol = protocol;
// 注意这里没有重新定义sk的sk_data_ready函数
// 在sock_init_data()函数中将sk_data_ready定义为sock_def_readable()函数
 return 0;
}

用户空间使用socket(2)系统调用打开netlink类型的套接口时, 在内核中会调用sys_socket()函数, 然后是调用__sock_create()函数, 在其中调用netlink协议族的create()函数, 即netlink_create()函数.
 
4.2 建立服务器端的套接口

以前也介绍过另一个建立netlink sock的函数netlink_kernel_create, 一般是在netlink的各种协议类型模块初始化时调用的, 而不是socket系统调用时调用的, 每个netlink协议初始化时只调用一次, 建立一个内核中的netlink接口, 相当于服务器, 其中也调用了__netlink_create()函数:
/*
 * We export these functions to other modules. They provide a
 * complete set of kernel non-blocking support for message
 * queueing.
 */
/*
 * Create the kernel-side socket of netlink protocol @unit and register the
 * protocol in nl_table.
 *
 * @unit:   netlink protocol number, must lie in [0, MAX_LINKS)
 * @groups: number of multicast groups; values below 32 are raised to 32
 * @input:  optional per-protocol data handler stored in the socket's
 *          data_ready member
 * @module: module recorded in the table entry of the protocol
 *
 * Returns the new struct sock, or NULL on any failure.
 */
struct sock *
netlink_kernel_create(int unit, unsigned int groups,
                      void (*input)(struct sock *sk, int len),
                      struct module *module)
{
 unsigned long *listeners = NULL;
 struct netlink_sock *nlk;
 struct socket *sock;
 struct sock *sk;

 BUG_ON(!nl_table);

 if (unit < 0 || unit >= MAX_LINKS)
  return NULL;

 /* "lite": only a bare struct socket is allocated here */
 if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
  return NULL;

 /* Build the netlink sock on top of that bare socket */
 if (__netlink_create(sock, unit) < 0)
  goto out_sock_release;

 groups = groups < 32 ? 32 : groups;

 /* listeners is a bitmap with one bit per group */
 listeners = kzalloc(NLGRPSZ(groups), GFP_KERNEL);
 if (listeners == NULL)
  goto out_sock_release;

 sk = sock->sk;
 nlk = nlk_sk(sk);

 /* Unlike user-side sockets, sk_data_ready is overridden here */
 sk->sk_data_ready = netlink_data_ready;
 /* Per-protocol data handler supplied by the caller */
 if (input != NULL)
  nlk->data_ready = input;

 /* The kernel-side socket is inserted with pid 0 */
 if (netlink_insert(sk, 0) != 0)
  goto out_sock_release;

 nlk->flags |= NETLINK_KERNEL_SOCKET;

 /* Publish the protocol in its table entry */
 netlink_table_grab();
 nl_table[unit].groups = groups;
 nl_table[unit].listeners = listeners;
 nl_table[unit].module = module;
 /* Marks the entry as registered */
 nl_table[unit].registered = 1;
 netlink_table_ungrab();

 return sk;

out_sock_release:
 kfree(listeners);
 sock_release(sock);
 return NULL;
}

5. netlink套接口的操作

在__netlink_create函数中定义了netlink套接口的操作结构为netlink_ops:
 sock->ops = &netlink_ops;
该结构定义如下:
/*
 * Socket operations of PF_NETLINK sockets, installed on every netlink socket
 * by __netlink_create().  Operations netlink does not provide point at the
 * generic sock_no_*() handlers.
 */
static const struct proto_ops netlink_ops = {
 .family = PF_NETLINK,
 .owner = THIS_MODULE,

 /* Implemented by netlink itself */
 .release = netlink_release,
 .bind = netlink_bind,
 .connect = netlink_connect,
 .getname = netlink_getname,
 .setsockopt = netlink_setsockopt,
 .getsockopt = netlink_getsockopt,
 .sendmsg = netlink_sendmsg,
 .recvmsg = netlink_recvmsg,

 /* Generic datagram implementation */
 .poll = datagram_poll,

 /* Not provided by netlink */
 .socketpair = sock_no_socketpair,
 .accept = sock_no_accept,
 .ioctl = sock_no_ioctl,
 .listen = sock_no_listen,
 .shutdown = sock_no_shutdown,
 .mmap = sock_no_mmap,
 .sendpage = sock_no_sendpage,
};

5.1 释放
在close(2)时调用
/*
 * Release a netlink socket; this is the release operation of netlink_ops
 * (per the article it runs on close(2)).
 *
 * Unlinks the sock, tears down a pending callback, detaches the sock from
 * @sock, sends a NETLINK_URELEASE notification where applicable, drops the
 * module reference, updates the per-protocol table and finally drops the
 * sock reference.
 *
 * Always returns 0.
 */
static int netlink_release(struct socket *sock)
{
 struct sock *sk = sock->sk;
 struct netlink_sock *nlk;
 if (!sk)
  return 0;
// Unlink sk from the netlink tables (per the article: the sk list and the bind list)
 netlink_remove(sk);
 nlk = nlk_sk(sk);
 spin_lock(&nlk->cb_lock);
 if (nlk->cb) {
// Tear down a pending callback: call its done hook if set, then destroy it
  if (nlk->cb->done)
   nlk->cb->done(nlk->cb);
  netlink_destroy_callback(nlk->cb);
  nlk->cb = NULL;
 }
 spin_unlock(&nlk->cb_lock);
 /* OK. Socket is unlinked, and, therefore,
    no new packets will arrive */
// Orphan sk and clear the socket's pointer to it
 sock_orphan(sk);
 sock->sk = NULL;
// Wake up everything sleeping on the socket's wait queue
 wake_up_interruptible_all(&nlk->wait);
// Drop all packets still in the write queue
 skb_queue_purge(&sk->sk_write_queue);
 if (nlk->pid && !nlk->subscriptions) {
// Socket had a pid and no subscriptions: notify the netlink_chain listeners
  struct netlink_notify n = {
      .protocol = sk->sk_protocol,
      .pid = nlk->pid,
       };
  atomic_notifier_call_chain(&netlink_chain,
    NETLINK_URELEASE, &n);
 } 
// Drop the module reference stored by netlink_create()
 if (nlk->module)
  module_put(nlk->module);
// Acts as taking the table lock
 netlink_table_grab();
 if (nlk->flags & NETLINK_KERNEL_SOCKET) {
// Kernel-side socket: unregister the protocol from its table entry.
// NOTE(review): listeners is freed but the table slot keeps the old pointer --
// confirm nothing can dereference it before the next netlink_kernel_create().
  kfree(nl_table[sk->sk_protocol].listeners);
  nl_table[sk->sk_protocol].module = NULL;
  nl_table[sk->sk_protocol].registered = 0;
 } else if (nlk->subscriptions)
// User-side socket with subscriptions: recompute the listeners bitmap
  netlink_update_listeners(sk);
// Acts as releasing the table lock
 netlink_table_ungrab();
// Free the multicast group bitmap of this socket
 kfree(nlk->groups);
 nlk->groups = NULL;
// Drop the reference on the sock
 sock_put(sk);
 return 0;
}

5.2 绑定bind
绑定通常是针对服务端
static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
 struct sock *sk = sock->sk;
 struct netlink_sock *nlk = nlk_sk(sk);
 struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
 int err;
// 检查一下地址的协议族是否为AF_NETLINK 
 if (nladdr->nl_family != AF_NETLINK)
  return -EINVAL;
 /* Only superuser is allowed to listen multicasts */
 if (nladdr->nl_groups) {
// 指定了多播组, 这是需要root权限
  if (!netlink_capable(sock, NL_NONROOT_RECV))
   return -EPERM;
  if (nlk->groups == NULL) {
// 分配多播组空间
   err = netlink_alloc_groups(sk);
   if (err)
    return err;
  }
 }
 if (nlk->pid) {
// 如果sock的pid非0, 检查是否匹配在nladdr地址结构中指定的pid
  if (nladdr->nl_pid != nlk->pid)
   return -EINVAL;
 } else {
// sock的pid为0, 根据nladdr是否指定pid来执行插入或
  err = nladdr->nl_pid ?
   netlink_insert(sk, nladdr->nl_pid) :
   netlink_autobind(sock);
  if (err)
   return err;
 }
// 非多播情况时就可以返回成功了
 if (!nladdr->nl_groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
  return 0;
 netlink_table_grab();
// 多播情况下更新sock参数
 netlink_update_subscriptions(sk, nlk->subscriptions +
                                  hweight32(nladdr->nl_groups) -
                                  hweight32(nlk->groups[0]));
 nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | nladdr->nl_groups;
 netlink_update_listeners(sk);
 netlink_table_ungrab();
 return 0;
}

/*
 * Insert @sk into the pid hash of its protocol under @pid.
 *
 * Returns 0 on success, -EADDRINUSE when another socket in the table already
 * uses @pid, -EBUSY when @sk already has a non-zero pid, or -ENOMEM when the
 * entry counter cannot grow any further.
 */
static int netlink_insert(struct sock *sk, u32 pid)
{
 /* Pid hash of the socket's netlink protocol */
 struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash;
 struct hlist_head *head;
 struct hlist_node *node;
 struct sock *osk;
 int chain_len = 0;
 int err;

 netlink_table_grab();

 /* Walk the bucket of this pid; a socket already using it means failure */
 head = nl_pid_hashfn(hash, pid);
 err = -EADDRINUSE;
 sk_for_each(osk, node, head) {
  if (nlk_sk(osk)->pid == pid)
   goto out;
  chain_len++;
 }

 /* Only a socket whose pid is still 0 may be inserted */
 if (nlk_sk(sk)->pid) {
  err = -EBUSY;
  goto out;
 }

 if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) {
  err = -ENOMEM;
  goto out;
 }

 /*
  * For a non-empty chain nl_pid_hash_dilute() may adjust the hash table;
  * when it reports that it did, the bucket has to be looked up again.
  */
 if (chain_len && nl_pid_hash_dilute(hash, chain_len))
  head = nl_pid_hashfn(hash, pid);

 hash->entries++;
 /* Record the pid in the socket and link the socket into the bucket */
 nlk_sk(sk)->pid = pid;
 sk_add_node(sk, head);
 err = 0;

out:
 netlink_table_ungrab();
 return err;
}

/*
 * Automatic bind, used by netlink_bind() when no pid was specified: pick a
 * pid that is not in use yet and insert the socket under it.
 *
 * Returns 0 on success (a -EBUSY result from netlink_insert() is also
 * treated as success) or another error returned by netlink_insert().
 */
static int netlink_autobind(struct socket *sock)
{
// Get the sock behind the socket
 struct sock *sk = sock->sk;
// Pid hash of the socket's netlink protocol
 struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash;
 struct hlist_head *head;
 struct sock *osk;
 struct hlist_node *node;
// First candidate: the thread group id of the current task
 s32 pid = current->tgid;
 int err;
// Next fallback candidate; signed 32-bit, static, so it is shared by all calls
 static s32 rover = -4097;
retry:
 cond_resched();
 netlink_table_grab();
// Look up the hash bucket of the candidate pid
 head = nl_pid_hashfn(hash, pid);
 sk_for_each(osk, node, head) {
// Is the candidate pid already present in the chain?
  if (nlk_sk(osk)->pid == pid) {
// Yes: take the next negative value from rover and start over
   /* Bind collision, search negative pid values. */
   pid = rover--;
// Clamp rover so that it never rises above -4097
   if (rover > -4097)
    rover = -4097;
   netlink_table_ungrab();
   goto retry;
  }
 }
 netlink_table_ungrab();
// pid is either the tgid or, after a collision, a negative rover value; it is
// passed to netlink_insert() as a u32, where a negative value becomes a very
// large number
 err = netlink_insert(sk, pid);
// Another socket took this pid in the meantime: pick a new one
 if (err == -EADDRINUSE)
  goto retry;
 /* If 2 threads race to autobind, that is fine.  */
 if (err == -EBUSY)
  err = 0;
 return err;
}
/*
 * Store a new subscription count for @sk and keep the socket's membership in
 * the protocol's mc_list in step with it: the socket is removed when the
 * count drops to zero and added when it becomes non-zero.
 */
static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
 struct netlink_sock *nlk = nlk_sk(sk);
 const int was_subscribed = nlk->subscriptions != 0;
 const int now_subscribed = subscriptions != 0;

 if (was_subscribed != now_subscribed) {
  if (was_subscribed)
   __sk_del_bind_node(sk);
  else
   sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
 }

 nlk->subscriptions = subscriptions;
}
// 更新listeners
static void
netlink_update_listeners(struct sock *sk)
{
 struct netlink_table *tbl = &nl_table[sk->sk_protocol];
 struct hlist_node *node;
 unsigned long mask;
 unsigned int i;
 for (i = 0; i < NLGRPSZ(tbl->groups)/sizeof(unsigned long); i++) {
  mask = 0;
// 遍历多播链表生成多播组的掩码
  sk_for_each_bound(sk, node, &tbl->mc_list)
   mask |= nlk_sk(sk)->groups[i];
  tbl->listeners[i] = mask;
 }
 /* this function is only called with the netlink table "grabbed", which
  * makes sure updates are visible before bind or setsockopt return. */
}

......待续.....
Linux内核中netlink协议族的实现(下)

相关推荐