linux bridge在虚拟化场景和docker中用的比较多,之前也知道它的原理,基本上就是类似二层交换机,根据mac地址和vid转发。但是对于vlan的处理网上的文档比较少,所以这次就看一下源码,分析下不配置vlan时如何转发,vlan又如何生效。
不配置vlan时,bridge纯靠mac转发,可通过如下两个命令之一查看mac转发表
//此命令只显示单播转发表,比较符合硬件交换机的显示规范,
//匹配到mac的,从port转发出去(可通过brctl showstp br1查看端
//口号和端口的对应关系)
root@node2:~# brctl showmacs br1
port no mac addr is local? ageing timer
2 12:27:96:8c:f4:58 yes 0.00
2 12:27:96:8c:f4:58 yes 0.00
1 66:e6:6f:a8:d4:97 yes 0.00
1 66:e6:6f:a8:d4:97 yes 0.00
//通过此命令可显示所有的单播和组播表项
root@node2:~# bridge fdb show br br1
33:33:00:00:00:01 dev br1 self permanent
66:e6:6f:a8:d4:97 dev vetha master br1 permanent
66:e6:6f:a8:d4:97 dev vetha vlan 1 master br1 permanent
33:33:00:00:00:01 dev vetha self permanent
01:00:5e:00:00:01 dev vetha self permanent
12:27:96:8c:f4:58 dev vethx master br1 permanent
12:27:96:8c:f4:58 dev vethx vlan 1 master br1 permanent
33:33:00:00:00:01 dev vethx self permanent
01:00:5e:00:00:01 dev vethx self permanent
这篇文档就先介绍不使能vlan的情况,主要分为下面几个部分 a. kernel端bridge module的初始化都做了哪些事 b. 添加网桥时,命令行和kernel端代码流程 c. 给网桥添加端口时,命令行和kernel端代码流程 d. 从端口收到报文后,内部是如何转发的
广播/组播/未知单播报文flood到所有端口。
查找到转发表项的已知单播报文,发送到此表项的出端口。
广播/组播报文、已知单播并且dst为local的报文,或者网桥设备使能了混杂模式,这几种情况都需要通过网桥设备将报文上送本机协议栈处理。
e. 从网桥br发出去的报文如何转发
广播/组播/未知单播报文,flood到所有端口。
能查找到转发表项的单播报文,从表项的出端口发送出去。
bridge还有如下几个注意的地方
单播flood: 控制未知单播报文是否向此端口flood一份,有两种设置方式,
a. bridge link set dev vnet1 flood on
b. echo 1 > /sys/class/net/br1/brif/vnet1/unicast_flood
hairpin模式:控制从某端口收到的广播/组播/未知单播报文,是否允许再从该接收端口发送出去。已知单播正常转发。
a. bridge link set dev vnet1 hairpin on
b. echo 1 > /sys/class/net/br1/brif/vnet1/hairpin_mode
网桥设备down后,所有端口状态都会变成 disabled, 导致网桥不会正确转发。
vetha (1)
port id 8001 state disabled
designated root 8000.3adce07c2043 path cost 2
designated bridge 8000.3adce07c2043 message age timer 0.00
designated port 8001 forward delay timer 0.00
designated cost 0 hold timer 0.00
flags
bridge netfilter框架,可使用ebtables设置和查看
image.png
1. module初始化流程
#module初始化流程
module_init(br_init)
static int __init br_init(void)
static const struct stp_proto br_stp_proto = {
.rcv = br_stp_rcv,
};
//注册stp协议处理函数,防止环路产生,此文不看stp部分
stp_proto_register(&br_stp_proto);
//初始化fdb表项用到的cache
br_fdb_init();
static struct kmem_cache *br_fdb_cache __read_mostly;
br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
sizeof(struct net_bridge_fdb_entry),0,
SLAB_HWCACHE_ALIGN, NULL);
static u32 fdb_salt __read_mostly;
get_random_bytes(&fdb_salt, sizeof(fdb_salt));
static struct pernet_operations br_net_ops = {
.exit = br_net_exit,
};
//注册pernet操作,只提供了exit,所以namespace初始化时无操作
register_pernet_subsys(&br_net_ops);
static struct notifier_block br_device_notifier = {
.notifier_call = br_device_event
};
//注册网络设备事件处理函数
register_netdevice_notifier(&br_device_notifier);
br_netlink_init();
br_mdb_init();
rtnl_register(PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, NULL);
rtnl_register(PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, NULL);
rtnl_register(PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, NULL);
static struct rtnl_af_ops br_af_ops = {
.family = AF_BRIDGE,
.get_link_af_size = br_get_link_af_size,
};
rtnl_af_register(&br_af_ops);
list_add_tail(&ops->list, &rtnl_af_ops);
struct rtnl_link_ops br_link_ops __read_mostly = {
.kind = "bridge",
.priv_size = sizeof(struct net_bridge),
.setup = br_dev_setup,
.maxtype = IFLA_BRPORT_MAX,
.policy = br_policy,
.validate = br_validate,
.newlink = br_dev_newlink,
.changelink = br_changelink,
.dellink = br_dev_delete,
.get_size = br_get_size,
.fill_info = br_fill_info,
.slave_maxtype = IFLA_BRPORT_MAX,
.slave_policy = br_port_policy,
.slave_changelink = br_port_slave_changelink,
.get_slave_size = br_port_get_slave_size,
.fill_slave_info = br_port_fill_slave_info,
};
rtnl_link_register(&br_link_ops);
__rtnl_link_register(ops);
list_add_tail(&ops->list, &link_ops);
//注册hook函数到br_ioctl_hook,添加网桥时调用br_ioctl_hook
brioctl_set(br_ioctl_deviceless_stub);
static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
br_ioctl_hook = hook;
2. 创建/删除桥流程
通过strace brctl命令,可知创建/删除桥是通过socket的ioctl调用到kernel端
//添加桥
root@node2:~# strace brctl addbr br1
execve("/usr/sbin/brctl", ["brctl", "addbr", "br1"], 0x7fffd27c39a0 /* 22 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0) = 3
ioctl(3, SIOCBRADDBR, "br1") = 0
//删除桥
root@node2:~# strace brctl delbr br1
execve("/usr/sbin/brctl", ["brctl", "delbr", "br1"], 0x7fff18eceaa0 /* 22 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0) = 3
ioctl(3, SIOCBRDELBR, "br1") = 0
#kernel端代码,ioctl最终会调用 sock_ioctl
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
switch (cmd) {
case SIOCGIFBR:
case SIOCSIFBR:
case SIOCBRADDBR:
case SIOCBRDELBR:
err = -ENOPKG;
if (!br_ioctl_hook)
request_module("bridge");
mutex_lock(&br_ioctl_mutex);
//调用之前注册的 br_ioctl_deviceless_stub
if (br_ioctl_hook)
err = br_ioctl_hook(net, cmd, argp); //br_ioctl_deviceless_stub
mutex_unlock(&br_ioctl_mutex);
break;
/*
 * br_ioctl_deviceless_stub - handle bridge ioctls that are issued on a
 * socket rather than on a specific bridge device.
 *
 * @net:  network namespace the request applies to
 * @cmd:  ioctl command number
 * @uarg: user-space pointer to the command argument
 *
 * SIOCGIFBR/SIOCSIFBR are delegated to old_deviceless().
 * SIOCBRADDBR/SIOCBRDELBR copy a bridge name from user space and then
 * create or delete the bridge with that name.
 *
 * Returns -EPERM when the caller lacks CAP_NET_ADMIN in net->user_ns,
 * -EFAULT when the name cannot be copied from user space, -EOPNOTSUPP for
 * any other command, otherwise the result of the delegated helper.
 */
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
switch (cmd) {
case SIOCGIFBR:
case SIOCSIFBR:
return old_deviceless(net, uarg);
case SIOCBRADDBR:
case SIOCBRDELBR:
{
/* Bridge name supplied by user space, at most IFNAMSIZ bytes. */
char buf[IFNAMSIZ];
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (copy_from_user(buf, uarg, IFNAMSIZ))
return -EFAULT;
/* Force NUL termination: user space may have filled all IFNAMSIZ bytes. */
buf[IFNAMSIZ-1] = 0;
if (cmd == SIOCBRADDBR)
return br_add_bridge(net, buf);
/* Only SIOCBRDELBR can reach this point. */
return br_del_bridge(net, buf);
}
}
return -EOPNOTSUPP;
}
int br_add_bridge(struct net *net, const char *name)
struct net_device *dev;
dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup);
alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
alloc_size = sizeof(struct net_device);
struct net_device *p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
struct net_device *dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev_addr_init(dev)
dev_mc_init(dev);
dev_uc_init(dev);
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
dev->gso_min_segs = 0;
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev); //br_dev_setup
struct net_bridge *br = netdev_priv(dev);
eth_hw_addr_random(dev);
dev->addr_assign_type = NET_ADDR_RANDOM;
eth_random_addr(dev->dev_addr);
ether_setup(dev);
dev->header_ops = &eth_header_ops;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
dev->addr_len = ETH_ALEN;
dev->tx_queue_len = 1000; /* Ethernet wants good queues */
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
dev->priv_flags |= IFF_TX_SKB_SHARING;
memset(dev->broadcast, 0xFF, ETH_ALEN);
dev->netdev_ops = &br_netdev_ops;
dev->destructor = br_dev_free;
dev->ethtool_ops = &br_ethtool_ops;
SET_NETDEV_DEVTYPE(dev, &br_type);
dev->tx_queue_len = 0;
dev->priv_flags = IFF_EBRIDGE;
dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL |
NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX;
dev->vlan_features = COMMON_FEATURES;
br->dev = dev;
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
spin_lock_init(&br->hash_lock);
br->bridge_id.prio[0] = 0x80;
br->bridge_id.prio[1] = 0x00;
/* Reserved Ethernet Addresses per IEEE 802.1Q */
static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
ether_addr_copy(br->group_addr, eth_reserved_addr_base);
br->stp_enabled = BR_NO_STP;
br->group_fwd_mask = BR_GROUPFWD_DEFAULT;
br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
br->designated_root = br->bridge_id;
br->bridge_max_age = br->max_age = 20 * HZ;
br->bridge_hello_time = br->hello_time = 2 * HZ;
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
br->ageing_time = 300 * HZ;
br_netfilter_rtable_init(br);
struct rtable *rt = &br->fake_rtable;
atomic_set(&rt->dst.__refcnt, 1);
rt->dst.dev = br->dev;
rt->dst.path = &rt->dst;
dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
rt->dst.ops = &fake_dst_ops;
br_stp_timer_init(br);
setup_timer(&br->hello_timer, br_hello_timer_expired, (unsigned long) br);
setup_timer(&br->tcn_timer, br_tcn_timer_expired, (unsigned long) br);
setup_timer(&br->topology_change_timer,br_topology_change_timer_expired,(unsigned long) br);
setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
br_multicast_init(br);
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
netif_alloc_netdev_queues(dev)
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
netif_alloc_rx_queues(dev)
strcpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
dev_net_set(dev, net);
dev->rtnl_link_ops = &br_link_ops;
register_netdev(dev);
register_netdevice(dev);
dev->netdev_ops->ndo_init(dev);//br_dev_init
struct net_bridge *br = netdev_priv(dev);
br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
//vlan相关初始化
br_vlan_init(br);
//支持的vlan协议,可以通过(/sys/class/net/br1/bridge/vlan_protocol)修改
br->vlan_proto = htons(ETH_P_8021Q);
//默认 pvid 为 1
br->default_pvid = 1;
//将vid 1和网桥mac添加到fdb中
br_vlan_add(br, 1, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED);
struct net_port_vlans *pv = NULL;
pv = rtnl_dereference(br->vlan_info);
if (pv)
return __vlan_add(pv, vid, flags);
pv = kzalloc(sizeof(*pv), GFP_KERNEL);
pv->parent.br = br;
__vlan_add(pv, vid, flags);
if (v->port_idx) {
p = v->parent.port;
br = p->br;
dev = p->dev;
} else {//port_idx为0,说明此vlan信息属于网桥设备本身
br = v->parent.br;
dev = br->dev;
}
if (p) {
vlan_vid_add(dev, br->vlan_proto, vid);
vlan_info = rtnl_dereference(dev->vlan_info);
vid_info = vlan_vid_info_get(vlan_info, proto, vid);
if (!vid_info) {
__vlan_vid_add(vlan_info, proto, vid, &vid_info);
vid_info = vlan_vid_info_alloc(proto, vid);
//如果硬件支持vlan filter,则设置到硬件
if (vlan_hw_filter_capable(dev, vid_info)) {
ops->ndo_vlan_rx_add_vid(dev, proto, vid);
list_add(&vid_info->list, &vlan_info->vid_list);
vlan_info->nr_vids++;
*pvid_info = vid_info;
vid_info->refcount++;
//插入fdb表项
br_fdb_insert(br, p, dev->dev_addr, vid);
//设置到 vlan_bitmap 中
set_bit(vid, v->vlan_bitmap);
v->num_vlans++;
__vlan_add_flags(v, vid, flags);
rcu_assign_pointer(br->vlan_info, pv);
3. 添加/删除接口流程
#添加接口
root@node2:~# strace brctl addif br1 vetha
execve("/usr/sbin/brctl", ["brctl", "addif", "br1", "vetha"], 0x7fff20137ba8 /* 22 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0) = 3
access("/proc/net", R_OK) = 0
access("/proc/net/unix", R_OK) = 0
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="vetha", }) = 0
close(4) = 0
ioctl(3, SIOCBRADDIF) = 0
#删除接口
root@node2:~# strace brctl delif br1 vetha
execve("/usr/sbin/brctl", ["brctl", "delif", "br1", "vetha"], 0x7ffe8db2f1a8 /* 22 vars */) = 0
socket(AF_UNIX, SOCK_STREAM, 0) = 3
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="vetha", }) = 0
close(4) = 0
ioctl(3, SIOCBRDELIF) = 0
static const struct net_device_ops br_netdev_ops = {
.ndo_do_ioctl = br_dev_ioctl,
...
.ndo_fix_features = br_fix_features,
.ndo_fdb_add = br_fdb_add,
.ndo_fdb_del = br_fdb_delete,
.ndo_fdb_dump = br_fdb_dump,
.ndo_bridge_getlink = br_getlink,
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
};
#kernel端代码,添加接口
/*
 * br_dev_ioctl - ioctl handler for a bridge device.
 *
 * @dev: the bridge net_device the ioctl was issued on
 * @rq:  interface request; rq->ifr_ifindex identifies the port device for
 *       SIOCBRADDIF/SIOCBRDELIF
 * @cmd: ioctl command number
 *
 * SIOCDEVPRIVATE is delegated to old_dev_ioctl(). SIOCBRADDIF/SIOCBRDELIF
 * add or remove the interface with the given ifindex via add_del_if().
 * Any other command is logged and rejected with -EOPNOTSUPP.
 */
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
/* The bridge state lives in the net_device private area. */
struct net_bridge *br = netdev_priv(dev);
switch (cmd) {
case SIOCDEVPRIVATE:
return old_dev_ioctl(dev, rq, cmd);
case SIOCBRADDIF:
case SIOCBRDELIF:
/* Third argument is non-zero for "add", zero for "delete". */
return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
}
br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
return -EOPNOTSUPP;
}
/*
 * add_del_if - add an interface to, or remove it from, a bridge.
 *
 * @br:      bridge to modify
 * @ifindex: interface index of the device to add or remove
 * @isadd:   non-zero to add the device as a port, zero to remove it
 *
 * Returns -EPERM when the caller lacks CAP_NET_ADMIN in the user namespace
 * of the bridge's network namespace, -EINVAL when no device with @ifindex
 * exists in that namespace, otherwise the result of br_add_if() or
 * br_del_if().
 *
 * called with RTNL
 */
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
/* The lookup is scoped to the network namespace of the bridge device. */
struct net *net = dev_net(br->dev);
struct net_device *dev;
int ret;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
dev = __dev_get_by_index(net, ifindex);
if (dev == NULL)
return -EINVAL;
if (isadd)
ret = br_add_if(br, dev);
else
ret = br_del_if(br, dev);
return ret;
}
int br_add_if(struct net_bridge *br, struct net_device *dev)
int err = 0;
bool changed_addr;
/* Don't allow bridging non-ethernet like devices */
if ((dev->flags & IFF_LOOPBACK) ||
dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
!is_valid_ether_addr(dev->dev_addr))
return -EINVAL;
//bridge接口不能加入另一个bridge
/* No bridging of bridges */
if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
return -ELOOP;
//加入bridge的接口不能加入另一个bridge,即一个接口不能同时加到两个bridge
/* Device is already being bridged */
if (br_port_exists(dev)) //#define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT)
return -EBUSY;
/* No bridging devices that dislike that (e.g. wireless) */
if (dev->priv_flags & IFF_DONT_BRIDGE)
return -EOPNOTSUPP;
struct net_bridge_port *p;
p = new_nbp(br, dev);
//找到最小可用的端口号。0保留不用,最大端口号为1<<10
index = find_portno(br);
p = kzalloc(sizeof(*p), GFP_KERNEL);
p->br = br;
dev_hold(dev);
p->dev = dev;
p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
//保存端口号
p->port_no = index;
p->flags = BR_LEARNING | BR_FLOOD;
br_init_port(p);
//优先级左移10位或上port_no作为端口id(port_id)
p->port_id = br_make_port_id(p->priority, p->port_no);
return ((u16)priority << BR_PORT_BITS) | (port_no & ((1<<BR_PORT_BITS)-1));
br_become_designated_port(p);
struct net_bridge *br;
br = p->br;
p->designated_root = br->designated_root;
p->designated_cost = br->root_path_cost;
p->designated_bridge = br->bridge_id;
p->designated_port = p->port_id;
//初始状态为 BR_STATE_BLOCKING
br_set_state(p, BR_STATE_BLOCKING);
p->state = state;
p->topology_change_ack = 0;
p->config_pending = 0;
//设置状态为 BR_STATE_DISABLED
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
setup_timer(&p->message_age_timer, br_message_age_timer_expired, (unsigned long) p);
setup_timer(&p->forward_delay_timer, br_forward_delay_timer_expired, (unsigned long) p);
setup_timer(&p->hold_timer, br_hold_timer_expired, (unsigned long) p);
br_multicast_add_port(p);
call_netdevice_notifiers(NETDEV_JOIN, dev);
//使能组播
dev_set_allmulti(dev, 1);
__dev_set_allmulti(dev, inc, true);
dev->flags |= IFF_ALLMULTI;
dev->allmulti += inc;
dev_change_rx_flags(dev, IFF_ALLMULTI);
dev_set_rx_mode(dev);
kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR);
//将接口信息添加到 sys 文件系统中:/sys/class/net/br1/brif/vnet1(桥br1上的接口vnet1)
br_sysfs_addif(p);
br_netpoll_enable(p);
//注册br_handle_frame到协议栈入口处
netdev_rx_handler_register(dev, br_handle_frame, p);
//设置flag IFF_BRIDGE_PORT,表示此接口已经加入桥
dev->priv_flags |= IFF_BRIDGE_PORT;
netdev_master_upper_dev_link(dev, br->dev);
//关闭 lro 功能
dev_disable_lro(dev);
//将接口加入桥的端口链表 br->port_list
list_add_rcu(&p->list, &br->port_list);
nbp_update_port_count(br);
list_for_each_entry(p, &br->port_list, list) {
//#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING)
//#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
//上面初始化时,p->flags = BR_LEARNING | BR_FLOOD,所以此处成立,cnt加1
if (br_auto_port(p))
cnt++;
}
if (br->auto_cnt != cnt) {
br->auto_cnt = cnt;
br_manage_promisc(br);
//如果bridge接口使能了混杂模式或者bridge接口没有使能vlan filter,则设置桥上所有接口使能混杂模式
if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br))
set_all = true;
list_for_each_entry(p, &br->port_list, list) {
if (set_all) {
br_port_set_promisc(p);
//使能接口混杂模式
dev_set_promiscuity(p->dev, 1);
//将fdb中静态表项从接口的单播地址列表删除
br_fdb_unsync_static(p->br, p);
for (i = 0; i < BR_HASH_SIZE; i++) {
hlist_for_each_entry_rcu(fdb, &br->hash[i], hlist) {
/* We only care for static entries */
if (!fdb->is_static)
continue;
dev_uc_del(p->dev, fdb->addr.addr);
}
}
p->flags |= BR_PROMISC;
} else {
if (br->auto_cnt == 0 ||
(br->auto_cnt == 1 && br_auto_port(p)))
br_port_clear_promisc(p);
//如果接口已经不是混杂模式则返回
//或者接口不支持单播过滤,此时也返回,不用关闭混杂模式,因为不支持单播过滤的接口
//最终都会使能混杂模式
if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT))
return;
br_fdb_sync_static(p->br, p);
struct net_bridge_fdb_entry *fdb, *tmp;
//将fdb中静态表项添加到接口的单播地址列表
for (i = 0; i < BR_HASH_SIZE; i++) {
hlist_for_each_entry(fdb, &br->hash[i], hlist) {
/* We only care for static entries */
if (!fdb->is_static)
continue;
err = dev_uc_add(p->dev, fdb->addr.addr);
if (err)
goto rollback;
}
}
dev_set_promiscuity(p->dev, -1);
else
br_port_set_promisc(p);
}
}
netdev_update_features(br->dev);
//给fdb插入一个表项,vid为 0
br_fdb_insert(br, p, dev->dev_addr, 0)
fdb_insert(br, source, addr, vid);
struct hlist_head *head = &br->hash[br_mac_hash(addr, vid)];
struct net_bridge_fdb_entry *fdb;
//根据mac地址和vid查找是否已经存在
fdb = fdb_find(head, addr, vid);
if (fdb) {
/* it is okay to have multiple ports with same
* address, just use the first one.
*/
//已经存在相同的mac,如果已存在的也是local是允许的。
//使用已存在的即可。这样正在添加的接口就不能根据fdb转发了
if (fdb->is_local)
return 0;
br_warn(br, "adding interface %s with same address "
"as a received packet\n",
source ? source->dev->name : br->dev->name);
//如果已经存在的fdb表项不是local的,则删除这个fdb,创建一个新的静态fdb
fdb_delete(br, fdb);
}
fdb = fdb_create(head, source, addr, vid);
struct net_bridge_fdb_entry *fdb;
fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
if (fdb) {
memcpy(fdb->addr.addr, addr, ETH_ALEN);
//source作为fdb表项的出接口
fdb->dst = source;
fdb->vlan_id = vid;
fdb->is_local = 0;
fdb->is_static = 0;
fdb->added_by_user = 0;
fdb->updated = fdb->used = jiffies;
//将fdb添加到链表
hlist_add_head_rcu(&fdb->hlist, head);
}
return fdb;
fdb->is_local = fdb->is_static = 1;
fdb_add_hw(br, addr);
//将此接口地址添加到bridge中不是混杂模式的接口上
list_for_each_entry(p, &br->port_list, list) {
if (!br_promisc_port(p)) {
err = dev_uc_add(p->dev, addr);
if (err)
goto undo;
}
}
nbp_vlan_init(p)
//default_pvid默认为1
p->br->default_pvid ? nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED) : 0;
struct net_port_vlans *pv = NULL;
pv = rtnl_dereference(port->vlan_info);
if (pv)
return __vlan_add(pv, vid, flags);
pv = kzalloc(sizeof(*pv), GFP_KERNEL);
pv->port_idx = port->port_no;
pv->parent.port = port;
__vlan_add(pv, vid, flags);
br_fdb_insert(br, p, dev->dev_addr, vid);
set_bit(vid, v->vlan_bitmap);
v->num_vlans++;
__vlan_add_flags(v, vid, flags);
if (flags & BRIDGE_VLAN_INFO_PVID)
__vlan_add_pvid(v, vid);
else
__vlan_delete_pvid(v, vid);
if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
set_bit(vid, v->untagged_bitmap);
else
clear_bit(vid, v->untagged_bitmap);
rcu_assign_pointer(port->vlan_info, pv);
changed_addr = br_stp_recalculate_bridge_id(br);
if (netif_running(dev) && netif_oper_up(dev) &&
(br->dev->flags & IFF_UP))
br_stp_enable_port(p);
br_init_port(p);
p->port_id = br_make_port_id(p->priority, p->port_no);
br_become_designated_port(p);
br_set_state(p, BR_STATE_BLOCKING);
p->topology_change_ack = 0;
p->config_pending = 0;
br_port_state_selection(p->br);
br_log_state(p);
br_info(p->br, "port %u(%s) entered %s state\n",(unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]);
if (changed_addr)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
//将桥上所有接口mtu的最小值设置到bridge接口上
dev_set_mtu(br->dev, br_min_mtu(br));
kobject_uevent(&p->kobj, KOBJ_ADD);
4. 接收报文处理流程
在协议栈入口函数 __netif_receive_skb_core 调用添加接口时注册的回调函数 br_handle_frame
rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
struct net_bridge_port *p;
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)->h_dest;
br_should_route_hook_t *rhook;
//不处理loopback报文
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
//如果源mac地址为全0,或者为组播地址,则drop此报文
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto drop;
//取出net_bridge_port结构
p = br_port_get_rcu(skb->dev); //rcu_dereference(dev->rx_handler_data);
//如果目的mac为01-80-C2-00-00-0X(链路本地保留地址),则需要特殊处理此种报文
if (unlikely(is_link_local_ether_addr(dest))) {
/*
* See IEEE 802.1D Table 7-10 Reserved addresses
*
* Assignment Value
* Bridge Group Address 01-80-C2-00-00-00
* (MAC Control) 802.3 01-80-C2-00-00-01
* (Link Aggregation) 802.3 01-80-C2-00-00-02
* 802.1X PAE address 01-80-C2-00-00-03
*
* 802.1AB LLDP 01-80-C2-00-00-0E
*
* Others reserved for future standardization
*/
switch (dest[5]) {
case 0x00: /* Bridge Group Address */
/* If STP is turned off,
then must forward to keep loop detection */
if (p->br->stp_enabled == BR_NO_STP ||
fwd_mask & (1u << dest[5]))
goto forward;
break;
case 0x01: /* IEEE MAC (Pause) */
goto drop;
default:
/* Allow selective forwarding for most other protocols */
fwd_mask |= p->br->group_fwd_mask;
if (fwd_mask & (1u << dest[5]))
goto forward;
}
/* Deliver packet to local host only */
if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
NULL, br_handle_local_finish)) {
return RX_HANDLER_CONSUMED; /* consumed by filter */
} else {
*pskb = skb;
return RX_HANDLER_PASS; /* continue processing */
}
}
forward:
switch (p->state) {
case BR_STATE_FORWARDING:
//如果支持 broute
rhook = rcu_dereference(br_should_route_hook); //ebt_broute
if (rhook) {
if ((*rhook)(skb)) {
*pskb = skb;
return RX_HANDLER_PASS;
}
dest = eth_hdr(skb)->h_dest;
}
/* fall through */
case BR_STATE_LEARNING:
//如果报文目的mac是br接口的mac,则设置 PACKET_HOST
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
//netfilter处理
NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
br_handle_frame_finish);
break;
default:
drop:
kfree_skb(skb);
}
return RX_HANDLER_CONSUMED;
//广播/组播/未知单播报文flood到所有端口。
//查找到fdb表项的已知单播报文,发送到此表项的出端口。
//广播/组播报文、已知单播并且dst为local的报文,或者网桥设备使能了混杂模式,这几种情况都需要通过网桥设备将报文上送本机协议栈处理。
/* note: already called with rcu_read_lock */
int br_handle_frame_finish(struct sk_buff *skb)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
struct net_bridge *br;
struct net_bridge_fdb_entry *dst;
struct net_bridge_mdb_entry *mdst;
struct sk_buff *skb2;
bool unicast = true;
u16 vid = 0;
if (!p || p->state == BR_STATE_DISABLED)
goto drop;
if (!br_allowed_ingress(p->br, nbp_get_vlan_info(p), skb, &vid))
goto out;
/* insert into forwarding database after filtering to avoid spoofing */
br = p->br;
//更新fdb表项,如果之前没有就新创建
if (p->flags & BR_LEARNING)
br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
//处理组播报文
if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) &&
br_multicast_rcv(br, p, skb, vid))
goto drop;
if (p->state == BR_STATE_LEARNING)
goto drop;
//将网桥设备保存到 skb 中
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
/* The packet skb2 goes to the local host (NULL to skip). */
//如果skb2不为空,则需要上送本地协议栈
skb2 = NULL;
//如果网桥设备打开了混杂模式,则设置 skb2=skb,说明需要上送本地协议栈
if (br->dev->flags & IFF_PROMISC)
skb2 = skb;
dst = NULL;
//如果是广播报文,则也设置skb2=skb,说明需要上送本地协议栈
if (is_broadcast_ether_addr(dest)) {
skb2 = skb;
unicast = false;
} else if (is_multicast_ether_addr(dest)) {
//组播报文处理
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb))) {
if ((mdst && mdst->mglist) ||
br_multicast_is_router(br))
skb2 = skb;
br_multicast_forward(mdst, skb, skb2);
skb = NULL;
if (!skb2)
goto out;
} else
skb2 = skb;
unicast = false;
br->dev->stats.multicast++;
//根据mac和vid查找fdb,如果目的地为local,则也要设置skb2=skb,说明需要上送本地协议栈
} else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) {
skb2 = skb;
/* Do not forward the packet since it's local. */
skb = NULL;
}
//以下四种情况时,skb不为空
//a. 广播报文
//b. 组播报文
//c. 单播报文,查找到了dst并且dst为非local
//d. 单播报文,查找不到dst,未知单播
if (skb) {
if (dst) {
dst->used = jiffies;
//查找到了dst并且dst为非local的单播报文。
//如果网桥设备没有使能混杂模式,则此时skb2为NULL
br_forward(dst->dst, skb, skb2);
if (should_deliver(to, skb)) {
if (skb0)
deliver_clone(to, skb, __br_forward);
else
__br_forward(to, skb);
return;
}
} else
//广播,组播和查找不到dst的单播报文
br_flood_forward(br, skb, skb2, unicast);
br_flood(br, skb, skb2, __br_forward, unicast);
//遍历网桥上所有端口,如果端口满足条件则给此端口发送一份报文
list_for_each_entry_rcu(p, &br->port_list, list) {
/* Do not flood unicast traffic to ports that turn it off */
//单播报文并且端口不允许flood(未设置BR_FLOOD),则跳过此端口,不给它发送报文
if (unicast && !(p->flags & BR_FLOOD))
continue;
prev = maybe_deliver(prev, p, skb, __packet_hook);
if (!should_deliver(p, skb))
//此处判断是否应该发给此端口,满足下面三个条件
//此端口不是接收报文端口。或者此端口使能了 hairpin 模式(针对此端口为接收报文端口来说)
//并且报文满足vlan过滤条件或者vlan过滤功能关闭
//并且端口状态为BR_STATE_FORWARDING
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
br_allowed_egress(p->br, nbp_get_vlan_info(p), skb) &&
p->state == BR_STATE_FORWARDING;
return prev;
if (!prev)
goto out;
err = deliver_clone(prev, skb, __packet_hook);
if (err)
return ERR_PTR(err);
out:
return p;
if (IS_ERR(prev))
goto out;
}
if (!prev)
goto out;
if (skb0)
deliver_clone(prev, skb, __packet_hook);
else
__packet_hook(prev, skb);
return;
}
//网桥设备使能了混杂模式,skb2肯定不为NULL
//广播/组播报文
//单播报文,查找到了dst并且dst为local
if (skb2)
return br_pass_frame_up(skb2);
out:
return 0;
drop:
kfree_skb(skb);
goto out;
}
/*
 * __br_forward - forward a frame to the given egress bridge port.
 *
 * @to:  egress port
 * @skb: frame to forward
 *
 * The frame is dropped if skb_warn_if_lro() flags it. Otherwise the egress
 * VLAN handling is applied, skb->dev is switched to the egress port device,
 * and the frame is passed through the NF_BR_FORWARD netfilter hook with
 * br_forward_finish() as the continuation.
 */
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
{
struct net_device *indev;
if (skb_warn_if_lro(skb)) {
kfree_skb(skb);
return;
}
skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
/* NOTE(review): presumably br_handle_vlan() has already freed the skb when it returns NULL - confirm. */
if (!skb)
return;
/* Remember the ingress device; the netfilter hook takes it as the "in" device. */
indev = skb->dev;
/* Replace the device in the skb with the egress port's device. */
skb->dev = to->dev;
skb_forward_csum(skb);
NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev,
br_forward_finish);
}
/*
 * br_forward_finish - continuation used after the forward/local-out hooks.
 *
 * @skb: frame being transmitted; skb->dev is used as the "out" device
 *
 * Passes the frame through the NF_BR_POST_ROUTING netfilter hook with
 * br_dev_queue_push_xmit() as the continuation, and returns the hook result.
 */
int br_forward_finish(struct sk_buff *skb)
{
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
}
/*
 * br_dev_queue_push_xmit - final transmit step for a bridged frame.
 *
 * @skb: frame to transmit on skb->dev
 *
 * Frees the frame if nf_bridge_maybe_copy_header() reports failure or if
 * is_skb_forwardable() rejects it for skb->dev. Otherwise pushes the
 * Ethernet header back onto the skb and queues it for transmission.
 * Always returns 0.
 */
int br_dev_queue_push_xmit(struct sk_buff *skb)
{
/* ip_fragment doesn't copy the MAC header */
if (nf_bridge_maybe_copy_header(skb) ||
!is_skb_forwardable(skb->dev, skb)) {
kfree_skb(skb);
} else {
/* Re-expose the ETH_HLEN-byte Ethernet header before transmitting. */
skb_push(skb, ETH_HLEN);
br_drop_fake_rtable(skb);
/* Send the frame out through the network device. */
dev_queue_xmit(skb);
}
return 0;
}
/*
 * br_pass_frame_up - deliver a frame to the local protocol stack through
 * the bridge device.
 *
 * @skb: frame to deliver; BR_INPUT_SKB_CB(skb)->brdev must already hold
 *       the bridge device
 *
 * Updates the bridge's per-CPU rx counters, applies the bridge device's own
 * VLAN egress check (skipped when the bridge device is promiscuous), makes
 * the bridge device the skb's device, and runs the NF_BR_LOCAL_IN netfilter
 * hook with netif_receive_skb() as the continuation.
 *
 * Returns NET_RX_DROP when the frame is rejected by the VLAN check or when
 * br_handle_vlan() returns NULL, otherwise the result of the hook.
 */
static int br_pass_frame_up(struct sk_buff *skb)
{
struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
struct net_bridge *br = netdev_priv(brdev);
struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
struct net_port_vlans *pv;
/* Account the frame in this CPU's rx statistics for the bridge device. */
u64_stats_update_begin(&brstats->syncp);
brstats->rx_packets++;
brstats->rx_bytes += skb->len;
u64_stats_update_end(&brstats->syncp);
/* Bridge is just like any other port. Make sure the
* packet is allowed except in promisc mode when someone
* may be running packet capture.
*/
pv = br_get_vlan_info(br);
if (!(brdev->flags & IFF_PROMISC) &&
!br_allowed_egress(br, pv, skb)) {
kfree_skb(skb);
return NET_RX_DROP;
}
/* Remember the receiving port device; the netfilter hook takes it as the "in" device. */
indev = skb->dev;
/* Replace the device in the skb with the bridge device. */
/* The bridge device itself has no br_handle_frame rx handler registered, so netif_receive_skb can deliver the frame to the protocol stack. */
skb->dev = brdev;
skb = br_handle_vlan(br, pv, skb);
if (!skb)
return NET_RX_DROP;
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL,
netif_receive_skb);
}
5. 网桥设备发送报文流程
处理比较简单,广播/组播/未知单播报文,flood到所有端口。
能查找到fdb表项的单播报文,从表项的出端口发送出去。
netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct net_bridge *br = netdev_priv(dev);
const unsigned char *dest = skb->data;
struct net_bridge_fdb_entry *dst;
struct net_bridge_mdb_entry *mdst;
struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
u16 vid = 0;
rcu_read_lock();
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
br_nf_pre_routing_finish_bridge_slow(skb);
rcu_read_unlock();
return NETDEV_TX_OK;
}
#endif
u64_stats_update_begin(&brstats->syncp);
brstats->tx_packets++;
brstats->tx_bytes += skb->len;
u64_stats_update_end(&brstats->syncp);
//将网桥设备dev保存到skb
BR_INPUT_SKB_CB(skb)->brdev = dev;
skb_reset_mac_header(skb);
skb_pull(skb, ETH_HLEN);
//是否满足vlan filter或者vlan filter功能关闭
if (!br_allowed_ingress(br, br_get_vlan_info(br), skb, &vid))
goto out;
if (is_broadcast_ether_addr(dest))
//广播报文,发送到所有网桥上的端口
br_flood_deliver(br, skb, false);
br_flood(br, skb, NULL, __br_deliver, unicast);
else if (is_multicast_ether_addr(dest)) {
//组播报文处理,不详细分析了
if (unlikely(netpoll_tx_running(dev))) {
br_flood_deliver(br, skb, false);
goto out;
}
if (br_multicast_rcv(br, NULL, skb, vid)) {
kfree_skb(skb);
goto out;
}
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb)))
br_multicast_deliver(mdst, skb);
else
br_flood_deliver(br, skb, false);
} else if ((dst = __br_fdb_get(br, dest, vid)) != NULL)
br_deliver(dst->dst, skb);
//查找到fdb表项,经过netfilter处理后,最终调用dev_queue_xmit从网卡发送出去
if (to && should_deliver(to, skb)) {
__br_deliver(to, skb);
skb = br_handle_vlan(to->br, nbp_get_vlan_info(to), skb);
skb->dev = to->dev;
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, br_forward_finish);
NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, br_dev_queue_push_xmit);
dev_queue_xmit(skb);
else
//未知单播
br_flood_deliver(br, skb, true);
out:
rcu_read_unlock();
return NETDEV_TX_OK;
也可参考:linux bridge - mac转发 - 简书 (jianshu.com)
|