Flower Classifier mellanox flow_indr_block_cb_register TC_SETUP_BLOCK TC_SETUP_CLSFLOWER fl_classify tcf_proto_ops cls_fl_ops cls_bpf bpf action Linux flow offload提高路由转发效率
7.TC routine: htb
qdisc_enqueue_root(sch_generic.h) -> qdisc_enqueue(sch_generic.h) -> htb_enqueue(sch_htb.c) ->htb_classify(sch_htb.c) -> flow_classify(cls_flow.c) -> tcf_exts_exec(pkt_cls.h) ->tcf_action_exec(act_api.c) -> tcf_act_police(act_police.c)
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c:837: err = __flow_indr_block_cb_register(netdev, rpriv, include/net/flow_offload.h:396:int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, include/net/flow_offload.h:404:int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
Tc Flower allows to specify three operating modes: skip_sw, skip_hw and not specified.
In the skip_sw mode TC Flower tries to offload the rule to the NIC driver. If this operation fails the rule will not be added.
In the skip_hw mode TC Flower completely ignores the underlying hardware and sets the rules in software.
If we do not specify the operating mode, TC Flower first tries to put the rule in hardware and if the operation fails tries to allocate it in software.
if (f && !tc_skip_sw(f->flags)) { //硬件offload不调用 *res = f->res; return tcf_exts_exec(skb, &f->exts, res); }
/* Filter-flag helpers: a classifier rule may carry TCA_CLS_FLAGS_SKIP_HW
 * ("do not offload, software only") or TCA_CLS_FLAGS_SKIP_SW ("hardware
 * only"; fl_classify() skips such filters in the software fast path).
 * The `? true : false` on an already-boolean expression was redundant.
 */
static inline bool tc_skip_hw(u32 flags)
{
	/* True when the rule must NOT be installed in the NIC. */
	return (flags & TCA_CLS_FLAGS_SKIP_HW) != 0;
}

static inline bool tc_skip_sw(u32 flags)
{
	/* True when the rule must NOT be executed by the kernel TC path. */
	return (flags & TCA_CLS_FLAGS_SKIP_SW) != 0;
}
net/sched/cls_bpf.c:422: ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, true, net/sched/cls_flower.c没有调用tcf_exts_validate tcf_exts_validate tcf_action_init tcf_action_init_1
TC_SETUP_BLOCK TC_SETUP_CLSFLOWER
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); tc_setup_cb_add(block, tp, TC_SETUP_CLSFLOWER, &cls_flower, skip_sw, &f->flags, &f->in_hw_count, rtnl_held);
struct tcf_exts
/* struct tcf_exts - container for the actions attached to one classifier
 * rule.  Prepared by tcf_exts_init() and executed from the classifier's
 * ->classify() hook via tcf_exts_exec().
 */
struct tcf_exts {
#ifdef CONFIG_NET_CLS_ACT
	__u32 type; /* for backward compat(TCA_OLD_COMPAT) */
	int nr_actions;             /* number of entries used in actions[] */
	struct tc_action **actions; /* array of attached actions */
#endif
	/* Map to export classifier specific extension TLV types to the
	 * generic extensions API. Unsupported extensions must be set to 0.
	 */
	int action;
	int police;
};
/* struct tc_action - one instance of a TC action attached to a filter. */
struct tc_action {
	// private data of the action implementation
	void *priv;
	// ops table describing this action kind (see struct tc_action_ops)
	struct tc_action_ops *ops;
	// action type
	__u32 type; /* for backward compat(TCA_OLD_COMPAT) */
	// position of this action within the filter's action chain
	__u32 order;
	// next action in the chain
	struct tc_action *next;
};
#define TCA_CAP_NONE 0
// Action ops table: defines the behaviour of one action kind.  Usually
// each action is described by a static tc_action_ops structure provided
// by a kernel module and registered on a global list at init time.
struct tc_action_ops {
	// next entry in the registered-ops list
	struct tc_action_ops *next;
	struct tcf_hashinfo *hinfo;
	// action kind name, e.g. "police", "bpf"
	char kind[IFNAMSIZ];
	__u32 type; /* TBD to match kind */
	__u32 capab; /* capabilities includes 4 bit version */
	struct module *owner;
	// execute the action on one packet
	int (*act)(struct sk_buff *, struct tc_action *, struct tcf_result *);
	// dump statistics
	int (*get_stats)(struct sk_buff *, struct tc_action *);
	// dump the action's configuration to netlink
	int (*dump)(struct sk_buff *, struct tc_action *, int, int);
	// tear an instance down
	int (*cleanup)(struct tc_action *, int bind);
	// look an instance up by index
	int (*lookup)(struct tc_action *, u32);
	// parse netlink attributes and initialise an instance
	int (*init)(struct rtattr *, struct rtattr *, struct tc_action *, int , int);
	// iterate over all instances
	int (*walk)(struct sk_buff *, struct netlink_callback *, int, struct tc_action *);
};
/* net/sched/cls_flower.c Flower classifier * * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ 11 linux/kernel.h> linux/init.h> linux/module.h> linux/rhashtable.h> 16 linux/if_ether.h> linux/in6.h> linux/ip.h> 20 net/sch_generic.h> net/pkt_cls.h> net/ip.h> net/flow_dissector.h> 25 fl_flow_key { indev_ifindex; control; basic; eth; ipaddrs; 32 union { ipv4; ipv6; 35 }; tp; /* Ensure that we can do comparisons as longs. */ 38 fl_flow_mask_range { start; end; 42}; 43 fl_flow_mask { key; range; rcu; 48}; 49 cls_fl_head { ht; mask; dissector; hgen; mask_assigned; filters; ht_params; rcu; 59}; 60 cls_fl_filter { ht_node; mkey; exts; res; key; list; handle; flags; rcu; 71}; 72 mask) 74{ start; 76} 77 mask) 79{ key; key); size - 1; 83 i++) { i]) { i) i; i; 89 } 90 } first, sizeof(long)); last + 1, sizeof(long)); 93} 94 key, mask) 97{ start; 99} 100 key, mask) 103{ mask); mask); mask); i; 108 i += sizeof(long)) lmask++; 111} 112 key, mask) 115{ mask)); 117} 118
tcf_exts_exec
- TCA_CLS_FLAGS_SKIP_HW:只在软件(系统内核TC模块)添加规则,不在硬件添加。如果规则不能添加则报错。
- TCA_CLS_FLAGS_SKIP_SW:只在硬件(规则挂载的网卡)添加规则,不在软件添加。如果规则不能添加则报错。
- 默认(不带标志位):尝试同时在硬件和软件下载规则,如果规则不能在软件添加则报错。
通过TC命令查看规则,如果规则已经卸载到硬件了,可以看到 in_hw标志位。
tp,
res)
121{
root);
f;
skb_key;
skb_mkey;
126
nelems))
128 return -1;
129
mask);
skb_iif;
/* skb_flow_dissect() does not set n_proto in case an unknown protocol,
* so do it rather here.
*/
protocol;
skb_key, 0);
137
mask);
139
ht,
mask),
ht_params);
flags)) {
res;
res);
146 }
147 return -1;
148}
tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, struct tcf_result *res) { if (exts->nr_actions) return tcf_action_exec(skb, exts->actions, exts->nr_actions, res); return 0; } int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) { int ret = -1, i; if (skb->tc_verd & TC_NCLS) { skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); ret = TC_ACT_OK; goto exec_done; } for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; repeat: ret = a->ops->act(skb, a, res); if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) goto exec_done; } exec_done: return ret; }
net/sched/cls_flower.c 初始化
tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
General informations
The Linux kernel configuration item CONFIG_MLX5_CLS_ACT
:
- prompt: MLX5 TC classifier action support
- type: bool
- depends on:
CONFIG_MLX5_ESWITCH && CONFIG_NET_CLS_ACT
- defined in drivers/net/ethernet/mellanox/mlx5/core/Kconfig
- found in Linux kernels: 5.8–5.11, 5.11+HEAD
当前动作为police时,对应的回调为tcf_act_police,下面单独分析
tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
net/sched/cls_flow.c:441: err = tcf_exts_init(&fnew->exts, net, TCA_FLOW_ACT, TCA_FLOW_POLICE);
net/sched/cls_flower.c:1577: err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0);
/* tcf_exts_init - prepare a tcf_exts container before actions are parsed.
 * Room for up to TCA_ACT_MAX_PRIO actions is allocated; nr_actions stays
 * zero, so tcf_exts_exec() runs nothing until actions are attached.
 * Returns 0 on success, -ENOMEM if the action array cannot be allocated.
 */
static inline int tcf_exts_init(struct tcf_exts *exts, struct net *net,
				int action, int police)
{
#ifdef CONFIG_NET_CLS_ACT
	exts->nr_actions = 0;	/* empty => tcf_action_exec() not invoked */
	exts->type = 0;
	exts->net = net;
	exts->actions = kcalloc(TCA_ACT_MAX_PRIO,
				sizeof(struct tc_action *), GFP_KERNEL);
	if (exts->actions == NULL)
		return -ENOMEM;
#endif
	exts->action = action;
	exts->police = police;
	return 0;
}
fl_classify
tcf_classify --> fl_classify
/* fl_classify - flower classifier match hook, reached via tcf_classify().
 * For every mask registered on the classifier head: dissect the skb into
 * skb_key, apply the mask to build skb_mkey, and look the masked key up
 * in that mask's hash table.  On a hit whose filter does not carry
 * skip_sw, execute the filter's actions.  Filters flagged skip_sw are
 * hardware-only and are deliberately not run here.
 * Returns the action verdict, or -1 when nothing matched in software.
 */
static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		       struct tcf_result *res)
{
	struct cls_fl_head *head = rcu_dereference_bh(tp->root);
	struct fl_flow_key skb_mkey;
	struct fl_flow_key skb_key;
	struct fl_flow_mask *mask;
	struct cls_fl_filter *f;

	list_for_each_entry_rcu(mask, &head->masks, list) {
		flow_dissector_init_keys(&skb_key.control, &skb_key.basic);
		fl_clear_masked_range(&skb_key, mask);

		skb_flow_dissect_meta(skb, &mask->dissector, &skb_key);
		/* skb_flow_dissect() does not set n_proto in case an unknown
		 * protocol, so do it rather here.
		 */
		skb_key.basic.n_proto = skb_protocol(skb, false);
		skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key);
		skb_flow_dissect_ct(skb, &mask->dissector, &skb_key,
				    fl_ct_info_to_flower_map,
				    ARRAY_SIZE(fl_ct_info_to_flower_map));
		skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);

		/* Mask the dissected key before the hash lookup. */
		fl_set_masked_key(&skb_mkey, &skb_key, mask);

		f = fl_lookup(mask, &skb_mkey, &skb_key);
		if (f && !tc_skip_sw(f->flags)) {
			*res = f->res;
			return tcf_exts_exec(skb, &f->exts, res);
		}
	}
	return -1;
}
/* tcf_proto_create - allocate and initialise one classifier instance.
 * Resolves the tcf_proto_ops for @kind (e.g. "flower", "bpf"), caches
 * the classify callback on the tcf_proto itself, then invokes the
 * classifier's ->init() hook.
 * Returns the new tcf_proto, or an ERR_PTR() on failure.
 */
static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
					  u32 prio, struct tcf_chain *chain,
					  bool rtnl_held,
					  struct netlink_ext_ack *extack)
{
	struct tcf_proto *tp;
	int ret;

	tp = kzalloc(sizeof(*tp), GFP_KERNEL);
	if (!tp)
		return ERR_PTR(-ENOBUFS);

	tp->ops = tcf_proto_lookup_ops(kind, rtnl_held, extack);
	if (IS_ERR(tp->ops)) {
		ret = PTR_ERR(tp->ops);
		goto free_tp;
	}

	tp->classify = tp->ops->classify;
	tp->chain = chain;
	tp->protocol = protocol;
	tp->prio = prio;
	spin_lock_init(&tp->lock);
	refcount_set(&tp->refcnt, 1);

	ret = tp->ops->init(tp);
	if (ret) {
		/* ->init() failed: drop the module ref taken by lookup. */
		module_put(tp->ops->owner);
		goto free_tp;
	}
	return tp;

free_tp:
	kfree(tp);
	return ERR_PTR(ret);
}
149
tp)
151{
head;
153
GFP_KERNEL);
head)
ENOBUFS;
157
filters);
head);
160
161 return 0;
162}
163
head)
165{
rcu);
167
exts);
f);
170}
171
cookie)
173{
dev;
offload = {0};
tc;
177
tp, 0))
179 return;
180
TC_CLSFLOWER_DESTROY;
cookie;
183
TC_SETUP_CLSFLOWER;
offload;
186
tc);
188}
189
tp,
dissector,
mask,
key,
actions,
flags)
196{
dev;
offload = {0};
tc;
err;
201
flags))
EINVAL : 0;
204
TC_CLSFLOWER_REPLACE;
cookie;
dissector;
mask;
key;
actions;
211
TC_SETUP_CLSFLOWER;
offload;
214
tc);
216
flags))
err;
219
220 return 0;
221}
222
f)
224{
dev;
offload = {0};
tc;
228
tp, 0))
230 return;
231
TC_CLSFLOWER_STATS;
f;
exts;
235
TC_SETUP_CLSFLOWER;
offload;
238
tc);
240}
241
force)
243{
root);
next;
246
filters))
false;
249
list) {
f);
list);
fl_destroy_filter);
254 }
NULL);
mask_assigned)
ht);
rcu);
true;
260}
261
handle)
263{
root);
f;
266
list)
handle)
f;
270 return 0;
271}
272
TCA_FLOWER_MAX + 1] = {
NLA_UNSPEC },
NLA_U32 },
NLA_STRING,
IFNAMSIZ },
ETH_ALEN },
ETH_ALEN },
ETH_ALEN },
ETH_ALEN },
NLA_U16 },
NLA_U8 },
NLA_U32 },
NLA_U32 },
NLA_U32 },
NLA_U32 },
in6_addr) },
in6_addr) },
in6_addr) },
in6_addr) },
NLA_U16 },
NLA_U16 },
NLA_U16 },
NLA_U16 },
296};
297
tb,
val_type,
len)
301{
val_type])
303 return;
len);
mask_type])
len);
307 else
len);
309}
310
tb,
mask)
313{
CONFIG_NET_CLS_IND
TCA_FLOWER_INDEV]) {
TCA_FLOWER_INDEV]);
err < 0)
err;
err;
indev_ifindex = 0xffffffff;
321 }
322#endif
323
TCA_FLOWER_KEY_ETH_DST,
TCA_FLOWER_KEY_ETH_DST_MASK,
dst));
TCA_FLOWER_KEY_ETH_SRC,
TCA_FLOWER_KEY_ETH_SRC_MASK,
src));
330
TCA_FLOWER_KEY_ETH_TYPE,
TCA_FLOWER_UNSPEC,
n_proto));
334
ETH_P_IP) ||
ETH_P_IPV6)) {
TCA_FLOWER_KEY_IP_PROTO,
TCA_FLOWER_UNSPEC,
ip_proto));
340 }
341
TCA_FLOWER_KEY_IPV4_DST]) {
FLOW_DISSECTOR_KEY_IPV4_ADDRS;
TCA_FLOWER_KEY_IPV4_SRC,
TCA_FLOWER_KEY_IPV4_SRC_MASK,
src));
TCA_FLOWER_KEY_IPV4_DST,
TCA_FLOWER_KEY_IPV4_DST_MASK,
dst));
TCA_FLOWER_KEY_IPV6_DST]) {
FLOW_DISSECTOR_KEY_IPV6_ADDRS;
TCA_FLOWER_KEY_IPV6_SRC,
TCA_FLOWER_KEY_IPV6_SRC_MASK,
src));
TCA_FLOWER_KEY_IPV6_DST,
TCA_FLOWER_KEY_IPV6_DST_MASK,
dst));
358 }
359
IPPROTO_TCP) {
TCA_FLOWER_KEY_TCP_SRC,
TCA_FLOWER_UNSPEC,
src));
TCA_FLOWER_KEY_TCP_DST,
TCA_FLOWER_UNSPEC,
dst));
IPPROTO_UDP) {
TCA_FLOWER_KEY_UDP_SRC,
TCA_FLOWER_UNSPEC,
src));
TCA_FLOWER_KEY_UDP_DST,
TCA_FLOWER_UNSPEC,
dst));
374 }
375
376 return 0;
377}
378
mask1,
mask2)
381{
mask1);
mask2);
384
range)) &&
mask1));
387}
388
fl_ht_params = {
/* base offset */
ht_node),
true,
393};
394
head,
mask)
397{
fl_ht_params;
mask);
start;
401
ht_params);
403}
404
member)
member))
member)
member))
409
member)
end &&
start)
413
member)
415 do {
id;
member);
cnt++;
419 } while(0);
420
member)
422 do {
member))
member);
425 } while(0);
426
head,
mask)
429{
FLOW_DISSECTOR_KEY_MAX];
cnt = 0;
432
control);
basic);
cnt,
eth);
cnt,
ipv4);
cnt,
ipv6);
cnt,
tp);
443
cnt);
445}
446
head,
mask)
449{
err;
451
mask_assigned) {
mask))
EINVAL;
455 else
456 return 0;
457 }
458
/* Mask is not assigned yet. So assign it and init hashtable
* according to that.
*/
mask);
err)
err;
mask));
true;
467
mask);
469
470 return 0;
471}
472
tp,
mask,
tb,
ovr)
477{
e;
err;
480
TCA_FLOWER_ACT, 0);
ovr);
err < 0)
err;
485
TCA_FLOWER_CLASSID]) {
TCA_FLOWER_CLASSID]);
base);
489 }
490
key);
err)
errout;
494
mask);
mask);
497
e);
499
500 return 0;
errout:
e);
err;
504}
505
tp,
head)
508{
i = 0x80000000;
handle;
511
512 do {
hgen == 0x7FFFFFFF)
hgen = 1;
hgen));
516
i == 0)) {
handle = 0;
520 } else {
hgen;
522 }
523
handle;
525}
526
in_skb,
base,
tca,
ovr)
531{
root);
arg;
fnew;
TCA_FLOWER_MAX + 1];
mask = {};
err;
538
TCA_OPTIONS])
EINVAL;
541
fl_policy);
err < 0)
err;
545
handle)
EINVAL;
548
GFP_KERNEL);
fnew)
ENOBUFS;
552
TCA_FLOWER_ACT, 0);
554
handle) {
head);
handle) {
EINVAL;
errout;
560 }
561 }
handle;
563
TCA_FLOWER_FLAGS]) {
TCA_FLOWER_FLAGS]);
566
flags)) {
EINVAL;
errout;
570 }
571 }
572
ovr);
err)
errout;
576
mask);
err)
errout;
580
flags)) {
ht_node,
ht_params);
err)
errout;
586 }
587 //调用硬件offload
tp,
dissector,
key,
key,
exts,
fnew,
flags);
err)
errout;
597
fold) {
ht_node,
ht_params);
fold);
602 }
603
fnew;
605
fold) {
list);
res);
fl_destroy_filter);
610 } else {
filters);
612 }
613
614 return 0;
615
errout:
fnew);
err;
619}
620
arg)
622{
root);
arg;
625
ht_node,
ht_params);
list);
f);
res);
fl_destroy_filter);
632 return 0;
633}
634
arg)
636{
root);
f;
639
list) {
skip)
skip;
arg) < 0) {
stop = 1;
645 break;
646 }
skip:
count++;
649 }
650}
651
skb,
val_type,
len)
655{
err;
657
len))
659 return 0;
val);
err)
err;
TCA_FLOWER_UNSPEC) {
mask);
err)
err;
667 }
668 return 0;
669}
670
fh,
t)
673{
root);
fh;
nest;
mask;
678
f)
len;
681
handle;
683
TCA_OPTIONS);
nest)
nla_put_failure;
687
classid &&
classid))
nla_put_failure;
691
key;
key;
694
indev_ifindex) {
dev;
697
indev_ifindex);
name))
nla_put_failure;
701 }
702
f);
704
TCA_FLOWER_KEY_ETH_DST,
TCA_FLOWER_KEY_ETH_DST_MASK,
dst)) ||
TCA_FLOWER_KEY_ETH_SRC,
TCA_FLOWER_KEY_ETH_SRC_MASK,
src)) ||
TCA_FLOWER_KEY_ETH_TYPE,
TCA_FLOWER_UNSPEC,
n_proto)))
nla_put_failure;
ETH_P_IP) ||
ETH_P_IPV6)) &&
TCA_FLOWER_KEY_IP_PROTO,
TCA_FLOWER_UNSPEC,
ip_proto)))
nla_put_failure;
721
FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
TCA_FLOWER_KEY_IPV4_SRC,
TCA_FLOWER_KEY_IPV4_SRC_MASK,
src)) ||
TCA_FLOWER_KEY_IPV4_DST,
TCA_FLOWER_KEY_IPV4_DST_MASK,
dst))))
nla_put_failure;
FLOW_DISSECTOR_KEY_IPV6_ADDRS &&
TCA_FLOWER_KEY_IPV6_SRC,
TCA_FLOWER_KEY_IPV6_SRC_MASK,
src)) ||
TCA_FLOWER_KEY_IPV6_DST,
TCA_FLOWER_KEY_IPV6_DST_MASK,
dst))))
nla_put_failure;
738
IPPROTO_TCP &&
TCA_FLOWER_KEY_TCP_SRC,
TCA_FLOWER_UNSPEC,
src)) ||
TCA_FLOWER_KEY_TCP_DST,
TCA_FLOWER_UNSPEC,
dst))))
nla_put_failure;
IPPROTO_UDP &&
TCA_FLOWER_KEY_UDP_SRC,
TCA_FLOWER_UNSPEC,
src)) ||
TCA_FLOWER_KEY_UDP_DST,
TCA_FLOWER_UNSPEC,
dst))))
nla_put_failure;
755
flags);
757
exts))
nla_put_failure;
760
nest);
762
exts) < 0)
nla_put_failure;
765
len;
767
nla_put_failure:
nest);
770 return -1;
771}
772
tcf_proto_ops cls_fl_ops
__read_mostly = {
fl_classify,
fl_init,
fl_destroy,
fl_get,
fl_change,
fl_delete,
fl_walk,
fl_dump,
THIS_MODULE,
784};
785
cls_fl_init(void)
787{
cls_fl_ops);
789}
790
cls_fl_exit(void)
792{
cls_fl_ops);
794}
795
cls_fl_init);
cls_fl_exit);
798
802
cls_bpf
/* Classifier-ops table registered for "bpf" filters. */
static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
	.kind		= "bpf",
	.classify	= cls_bpf_classify,	/* per-packet hook */
	.init		= cls_bpf_init,
	.destroy	= cls_bpf_destroy,
	.get		= cls_bpf_get,
	.change		= cls_bpf_change,	/* netlink add/replace */
	.delete		= cls_bpf_delete,
	.walk		= cls_bpf_walk,
	.dump		= cls_bpf_dump,
	.bind_class	= cls_bpf_bind_class,
	.owner		= THIS_MODULE,
};
/* cls_bpf_change - netlink handler that adds or replaces one BPF filter.
 * *arg carries the existing program when this is a replace (oldprog),
 * NULL when this is a fresh add.  On success the new program is linked
 * (RCU-safe) into the head's list and returned through *arg.
 */
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
			  struct tcf_proto *tp, unsigned long base,
			  u32 handle, struct nlattr **tca, void **arg,
			  bool ovr)
{
	struct cls_bpf_head *head = rtnl_dereference(tp->root);
	struct cls_bpf_prog *oldprog = *arg;	/* non-NULL => replace */
	struct nlattr *tb[TCA_BPF_MAX + 1];
	struct cls_bpf_prog *prog;
	int ret;

	if (tca[TCA_OPTIONS] == NULL)
		return -EINVAL;

	ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy,
			       NULL);
	if (ret < 0)
		return ret;

	prog = kzalloc(sizeof(*prog), GFP_KERNEL);
	if (!prog)
		return -ENOBUFS;

	/* Prepare the action container before parms/actions are parsed. */
	ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
	if (ret < 0)
		goto errout;

	/* A replace must target the same handle the caller named. */
	if (oldprog) {
		if (handle && oldprog->handle != handle) {
			ret = -EINVAL;
			goto errout;
		}
	}

	/* handle == 0 means the kernel picks a free one. */
	if (handle == 0)
		prog->handle = cls_bpf_grab_new_handle(tp, head);
	else
		prog->handle = handle;
	if (prog->handle == 0) {
		ret = -EINVAL;
		goto errout;
	}

	ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
	if (ret < 0)
		goto errout;

	/* Attempt hardware offload of the program. */
	ret = cls_bpf_offload(tp, prog, oldprog);
	if (ret) {
		__cls_bpf_delete_prog(prog);
		return ret;
	}

	if (!tc_in_hw(prog->gen_flags))
		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;

	if (oldprog) {
		/* RCU-safe swap; the old program is freed after a grace
		 * period via call_rcu().
		 */
		list_replace_rcu(&oldprog->link, &prog->link);
		tcf_unbind_filter(tp, &oldprog->res);
		call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu);
	} else {
		list_add_rcu(&prog->link, &head->plist);
	}

	*arg = prog;
	return 0;

errout:
	tcf_exts_destroy(&prog->exts);
	kfree(prog);
	return ret;
}
bpf action
/* Action-ops table registered for the "bpf" TC action. */
static struct tc_action_ops act_bpf_ops __read_mostly = {
	.kind		= "bpf",
	.id		= TCA_ID_BPF,
	.owner		= THIS_MODULE,
	.act		= tcf_bpf_act,		/* per-packet execution hook */
	.dump		= tcf_bpf_dump,
	.cleanup	= tcf_bpf_cleanup,
	.init		= tcf_bpf_init,
	.walk		= tcf_bpf_walker,
	.lookup		= tcf_bpf_search,
	.size		= sizeof(struct tcf_bpf),
};

/* Per-netns init: set up the bpf action table for this namespace. */
static __net_init int bpf_init_net(struct net *net)
{
	struct tc_action_net *tn = net_generic(net, bpf_net_id);

	return tc_action_net_init(net, tn, &act_bpf_ops);
}
/* Module entry point: register the "bpf" action ops with the TC action
 * subsystem.
 */
static int __init bpf_init_module(void)
{
	return tcf_register_action(&act_bpf_ops, &bpf_net_ops);
}
Linux flow offload提高路由转发效率
https://blog.csdn.net/dog250/article/details/103422860