netfilter的规则处理
netfilter的规则处理
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源:http://yfydz.cublog.cn
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn: yfydz_no1@hotmail.com
来源:http://yfydz.cublog.cn
1. 前言 netfilter中的防火墙规则是通过用户层的iptables命令来进行编辑的。而规则都是从属于某个表的(见我以前关于 netfilter新表的文章)。一般在mangle表对数据进行修改,在nat表对数据进行NAT,在filter表进行过滤。所不同的是NAT表中的规则只对新包(NEW/RELATED)进行处理,而MANGLE和FILTER表中的规则对所有数据包都处理。 以下Linux内核代码版本为2.4.26。 2. 数据结构 每条规则是用结构struct ipt_entry来定义的: /* include/linux/netfilter_ipv4/ip_tables.h */ struct ipt_entry { struct ipt_ip ip; /* Mark with fields that we care about. */ unsigned int nfcache; /* Size of ipt_entry + matches */ u_int16_t target_offset; /* Size of ipt_entry + matches + target */ u_int16_t next_offset; /* Back pointer */ unsigned int comefrom; /* Packet and byte counters. */ struct ipt_counters counters; /* The matches (if any), then the target. */ unsigned char elems[0]; }; 参数说明: struct ipt_ip ip:基本匹配项,包括协议、源地址/掩码、目的地址/掩码、进入网卡、出网卡等 unsigned int nfcache:标志项 u_int16_t target_offset:规则动作的偏移位置 u_int16_t next_offset:下一个规则的偏移位置 unsigned int comefrom:规则返回点 struct ipt_counters counters:计数器 unsigned char elems[0]:规则匹配项表,最后是动作项 ipt_ip结构: struct ipt_ip { /* Source and destination IP addr */ struct in_addr src, dst; /* Mask for src and dest IP addr */ struct in_addr smsk, dmsk; char iniface[IFNAMSIZ], outiface[IFNAMSIZ]; unsigned char iniface_mask[IFNAMSIZ], outiface_mask[IFNAMSIZ]; /* Protocol, 0 = ANY */ u_int16_t proto; /* Flags word */ u_int8_t flags; /* Inverse flags */ u_int8_t invflags; }; 规则中的匹配项结构,注意这不是描述匹配的结构struct ipt_match struct ipt_entry_match { union { // 这是用户空间(iptables)用到的部分,只提供名称即可 struct { u_int16_t match_size; /* Used by userspace */ char name[IPT_FUNCTION_MAXNAMELEN]; } user; // 这是内核空间用到的部分,指向具体的匹配模块 struct { u_int16_t match_size; /* Used inside the kernel */ struct ipt_match *match; } kernel; /* Total length */ u_int16_t match_size; } u; unsigned char data[0]; }; 规则中的目标(规则动作)项结构,注意这不是描述目标的结构struct ipt_target struct ipt_entry_target { union { // 这是用户空间(iptables)用到的部分,只提供名称即可 struct { u_int16_t target_size; /* Used by userspace */ char name[IPT_FUNCTION_MAXNAMELEN]; } user; // 这是内核空间用到的部分,指向具体的目标模块 struct { u_int16_t target_size; /* Used 
inside the kernel */ struct ipt_target *target; } kernel; /* Total length */ u_int16_t target_size; } u; unsigned char data[0]; }; 3. 规则集操作函数 netfilter处理规则的基本函数为ipt_do_table(),filter/mangle表的数据包最终都要进入该函数,而nat表只有NEW/RELATED的包才进入该函数。该函数遍历所定义的规则集,顺次进行匹配,一旦和规则的条件匹配成功,则按规则指定的动作返回,返回值可能为 NF_ACCEPT/NF_DROP/NF_QUEUE/NF_STOLEN等。 /* net/ipv4/netfilter/ip_tables.c */ unsigned int ipt_do_table(struct sk_buff **pskb, unsigned int hook, const struct net_device *in, const struct net_device *out, struct ipt_table *table, void *userdata) { static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))) = { 0 }; u_int16_t offset; struct iphdr *ip; void *protohdr; u_int16_t datalen; int hotdrop = 0; /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; void *table_base; struct ipt_entry *e, *back; /* Initialization */ ip = (*pskb)->nh.iph; protohdr = (u_int32_t *)ip + ip->ihl; datalen = (*pskb)->len - ip->ihl * 4; // 如果数据包的进入网卡或出网卡为NULL,则在规则匹配时用nulldevname代替 indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. 
*/ offset = ntohs(ip->frag_off) & IP_OFFSET; read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); // 找到规则集起点,每个表可在不同的挂接点定义规则集,但所有规则集都是统一 // 在一个数组里的 table_base = (void *)table->private->entries + TABLE_OFFSET(table->private, cpu_number_map(smp_processor_id())); // 第一个规则 e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG /* Check noone else using our table */ if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", smp_processor_id(), table->name, &((struct ipt_entry *)table_base)->comefrom, ((struct ipt_entry *)table_base)->comefrom); } ((struct ipt_entry *)table_base)->comefrom = 0x57acc001; #endif // 规则集的最后一条规则,最后一条规则是链的缺省动作,不是全接收就是全部拒绝 /* For return from builtin chain */ back = get_entry(table_base, table->private->underflow[hook]); // 这看起来是个死循环,但因为最后一条规则是链的缺省动作,不是全接收就是全部拒绝, // 所以是能够跳出的,除非发生意外 do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); (*pskb)->nfcache |= e->nfcache; // 进行基本元素(struct ipt_ip中定义的元素)的匹配,符合再进行后续匹配 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { struct ipt_entry_target *t; // 循环匹配规则中独立的匹配条件 if (IPT_MATCH_ITERATE(e, do_match, *pskb, in, out, offset, protohdr, datalen, &hotdrop) != 0) goto no_match; // 全部条件匹配,计数器增加 ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); // 获取规则目标 t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); /* Standard target? */ if (!t->u.kernel.target->target) { int v; // 标准目标,正常情况v值是小于0的,如ACCEPT实际对应-NF_ACCEPT-1, // DROP对应-NF_DROP-1,都是小于0的数 v = ((struct ipt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? 
*/ if (v != IPT_RETURN) { // verdict重新计算回正常值 verdict = (unsigned)(-v) - 1; break; } // 对于IPT_RETURN,返回原来的链重新继续循环 e = back; back = get_entry(table_base, back->comefrom); continue; } if (table_base + v != (void *)e + e->next_offset) { /* Save old back ptr in next entry */ struct ipt_entry *next = (void *)e + e->next_offset; next->comefrom = (void *)back - table_base; /* set back pointer to next entry */ back = next; } e = get_entry(table_base, v); } else { // 规则目标非标准目标,而是单独定义的目标模块 /* Targets which reenter must return abs. verdicts */ #ifdef CONFIG_NETFILTER_DEBUG ((struct ipt_entry *)table_base)->comefrom = 0xeeeeeeec; #endif // 调用目标模块的target()函数 verdict = t->u.kernel.target->target(pskb, hook, in, out, t->data, userdata); #ifdef CONFIG_NETFILTER_DEBUG if (((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec && verdict == IPT_CONTINUE) { printk("Target %s reentered!\n", t->u.kernel.target->name); verdict = NF_DROP; } ((struct ipt_entry *)table_base)->comefrom = 0x57acc001; #endif // 目标有可能修改数据包的各种信息,数据包本身也可能不再是原来的包而是拷贝 // 后的包,因此关于包的网络参数需要重新识别 /* Target might have changed stuff. */ ip = (*pskb)->nh.iph; protohdr = (u_int32_t *)ip + ip->ihl; datalen = (*pskb)->len - ip->ihl * 4; if (verdict == IPT_CONTINUE) // 返回IPT_CONTINUE时继续下一条规则的检查 // 注意不支持IPT_RETURN e = (void *)e + e->next_offset; else /* Verdict */ break; } } else { // 规则不匹配,找下一条规则继续 no_match: e = (void *)e + e->next_offset; } // 匹配模块中有hotdrop参数,允许匹配模块丢包,而通常匹配模块是不丢包的 } while (!hotdrop); #ifdef CONFIG_NETFILTER_DEBUG ((struct ipt_entry *)table_base)->comefrom = 0xdead57ac; #endif read_unlock_bh(&table->lock); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else if (hotdrop) return NF_DROP; else return verdict; #endif } 4. 
规则的修改 netfilter本质上是以数组方式保存规则集的,而每条规则的大小可能是不同的,因此在编辑规则时实际操作比较麻烦。对于iptables的各种编辑规则的命令,实际上都是替换操作:IPT_SO_SET_REPLACE,对应的处理函数为do_replace()。 /* net/ipv4/netfilter/ip_tables.c */ static int do_replace(void *user, unsigned int len) { int ret; struct ipt_replace tmp; struct ipt_table *t; struct ipt_table_info *newinfo, *oldinfo; struct ipt_counters *counters; // 先从用户空间拷贝规则集的描述信息,由结构struct ipt_replace描述 if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; // 长度检查 /* Hack: Causes ipchains to give correct error msg --RR */ if (len != sizeof(tmp) + tmp.size) return -ENOPROTOOPT; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; // 分配实际的规则集内存空间,每个CPU一个 newinfo = vmalloc(sizeof(struct ipt_table_info) + SMP_ALIGN(tmp.size) * smp_num_cpus); if (!newinfo) return -ENOMEM; if (copy_from_user(newinfo->entries, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } // 分配老规则集的计数器空间准备返回给用户空间 counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters)); if (!counters) { ret = -ENOMEM; goto free_newinfo; } memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); // 转换规则,检查规则的合法性等操作 ret = translate_table(tmp.name, tmp.valid_hooks, newinfo, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; duprintf("ip_tables: Translated table\n"); // 找到相应的ipt_table表 t = find_table_lock(tmp.name, &ret, &ipt_mutex); if (!t) goto free_newinfo_counters_untrans; /* You lied! 
*/ if (tmp.valid_hooks != t->valid_hooks) { duprintf("Valid hook crap: %08X vs %08X\n", tmp.valid_hooks, t->valid_hooks); ret = -EINVAL; goto free_newinfo_counters_untrans_unlock; } // 将新的规则集替换原来的规则集 oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); if (!oldinfo) goto free_newinfo_counters_untrans_unlock; /* Update module usage count based on number of rules */ duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", oldinfo->number, oldinfo->initial_entries, newinfo->number); if (t->me && (oldinfo->number <= oldinfo->initial_entries) && (newinfo->number > oldinfo->initial_entries)) __MOD_INC_USE_COUNT(t->me); else if (t->me && (oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) __MOD_DEC_USE_COUNT(t->me); /* Get the old counters. */ // 读取老规则集的计数器 get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ // 遍历清除老规则集,调用规则中匹配和目标模块的destroy()函数 IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); // 释放老规则集 vfree(oldinfo); /* Silent error: too late now. */ // 将计数器拷贝回用户空间 copy_to_user(tmp.counters, counters, sizeof(struct ipt_counters) * tmp.num_counters); // 将老计数器释放 vfree(counters); up(&ipt_mutex); return 0; free_newinfo_counters_untrans_unlock: up(&ipt_mutex); free_newinfo_counters_untrans: IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: vfree(newinfo); return ret; } 处理过程中比较重要的连接函数为translate_table()和replace_table(),也都在ip_tables.c中定义: static int translate_table(const char *name, unsigned int valid_hooks, struct ipt_table_info *newinfo, unsigned int size, unsigned int number, const unsigned int *hook_entries, const unsigned int *underflows) { unsigned int i; int ret; newinfo->size = size; newinfo->number = number; /* Init all hooks to impossible value. 
*/ for (i = 0; i < NF_IP_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ // 检查规则集中规则的合法性,检查偏移是否正确 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, check_entry_size_and_hooks, newinfo, newinfo->entries, newinfo->entries + size, hook_entries, underflows, &i); if (ret != 0) return ret; if (i != number) { duprintf("translate_table: %u not %u entries\n", i, number); return -EINVAL; } /* Check hooks all assigned */ for (i = 0; i < NF_IP_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(valid_hooks & (1 << i))) continue; // 检查是否在合法hook点没有设置规则,在每个合法hook点是必须有规则的 if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", i, hook_entries[i]); return -EINVAL; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", i, underflows[i]); return -EINVAL; } } // 检查自定义的链是否形成环 if (!mark_source_chains(newinfo, valid_hooks)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; // 遍历规则,通过调用匹配和目标的checkentry()函数检查其合法性 ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, check_entry, name, size, &i); if (ret != 0) { IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, &i); return ret; } // 规则集是每个CPU都有一个 /* And one copy for every other CPU */ for (i = 1; i < smp_num_cpus; i++) { memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, newinfo->entries, SMP_ALIGN(newinfo->size)); } return ret; } static struct ipt_table_info * replace_table(struct ipt_table *table, unsigned int num_counters, struct ipt_table_info *newinfo, int *error) { struct ipt_table_info *oldinfo; #ifdef CONFIG_NETFILTER_DEBUG { struct ipt_entry *table_base; unsigned int i; for (i = 0; i < smp_num_cpus; i++) { table_base = (void *)newinfo->entries + TABLE_OFFSET(newinfo, i); table_base->comefrom = 0xdead57ac; } } #endif /* Do the substitution. 
*/ write_lock_bh(&table->lock); /* Check inside lock: is the old number correct? */ if (num_counters != table->private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, table->private->number); write_unlock_bh(&table->lock); *error = -EAGAIN; return NULL; } // struct ipt_table结构中的private指向规则集 // 获取老规则集地址指针 oldinfo = table->private; // 指向新规则集 table->private = newinfo; newinfo->initial_entries = oldinfo->initial_entries; write_unlock_bh(&table->lock); return oldinfo; } 5. 结论 netfilter的规则是以数组方式顺序保存的,但每个元素(规则)的大小是不同的,每条规则除了基本部分相同外,还包括不同数量的匹配和目标项。规则匹配是顺序匹配,而编辑时实际上是将整个规则集全部替换。