7272 */
7373#define NL_DEFAULT_BATCH_SEND_THRESHOLD (15 * NL_PKT_BUF_SIZE)
7474
75+ /*
76+ * FRR_NLGRP_BIT - Convert an RTNLGRP_* group constant to its single-bit
77+ * mask for the nl_groups bitmask. RTNLGRP constants are 1-based bit numbers,
78+ * so shift by (group - 1). Only valid for groups with bit positions < 32
79+ * (group numbers 1..32); higher groups must use setsockopt(NETLINK_ADD_MEMBERSHIP) via ext_groups.
80+ */
81+ #define FRR_NLGRP_BIT (g ) ((uint32_t)1 << ((g)-1))
82+
7583static const struct message nlmsg_str [] = {
7684 { RTM_NEWROUTE , "RTM_NEWROUTE" },
7785 { RTM_DELROUTE , "RTM_DELROUTE" },
@@ -1616,189 +1624,198 @@ static bool kernel_netlink_nlsock_hash_equal(const void *arg1, const void *arg2)
16161624 return false;
16171625}
16181626
1619- /* Exported interface function. This function simply calls
1620- netlink_socket (). */
1627+ /*
1628+ * Set a netlink socket to non-blocking mode for integration with the
1629+ * event loop. F_SETFL is given O_NONBLOCK alone, so other settable status flags are cleared. Failure is only logged (flog_err_sys, as an OS-level error, tagged with nl->name); no status is returned.
1630+ */
1631+ static void netlink_set_nonblock (struct nlsock * nl )
1632+ {
1633+ if (fcntl (nl -> sock , F_SETFL , O_NONBLOCK ) < 0 )
1634+ flog_err_sys (EC_LIB_SOCKET , "Can't set %s socket non-blocking: %s" , nl -> name ,
1635+ safe_strerror (errno ));
1636+ }
1637+
1638+ /*
1639+ * Create, configure, and register a netlink socket. Consolidates the
1640+ * common 5-step init pattern: format name, mark uncreated, create socket,
1641+ * log on failure, and insert into the global nlsock hash.
1642+ * @nl: Socket state to fill in; nl->name and nl->sock are overwritten here
1643+ * @name_prefix: Prefix for socket name (e.g., "netlink-listen"); " (NS <ns_id>)" is appended
1644+ * @groups: Bitmask of RTMGRP/RTNLGRP groups for nl_groups (bit positions < 32)
1645+ * @ext_groups: Array of RTNLGRP group IDs that do not fit in nl_groups, for setsockopt subscription (NULL when ext_group_size is 0)
1646+ * @ext_group_size: Number of entries in ext_groups[]
1647+ * @ns_id: Network namespace ID (also printed in the socket name)
1648+ * @nl_family: Netlink protocol family (NETLINK_ROUTE or NETLINK_GENERIC)
1649+ * @warn_only: true -> log warning on failure, return -1 (non-fatal)
1650+ * false -> log error on failure, return -1; caller should exit (fatal)
1651+ *
1652+ * Returns 0 on success, -1 on failure (the socket is then not inserted into the hash).
1653+ */
1654+ static int kernel_init_nlsock (struct nlsock * nl , const char * name_prefix , unsigned long groups ,
1655+ uint32_t ext_groups [], uint8_t ext_group_size , ns_id_t ns_id ,
1656+ int nl_family , bool warn_only )
1657+ {
1658+ snprintf (nl -> name , sizeof (nl -> name ), "%s (NS %u)" , name_prefix , ns_id );
1659+ nl -> sock = -1 ;
1660+
1661+ if (netlink_socket (nl , groups , ext_groups , ext_group_size , ns_id , nl_family ) < 0 ) {
1662+ if (warn_only )
1663+ zlog_warn ("Failure to create %s socket" , nl -> name );
1664+ else
1665+ flog_err (EC_LIB_SOCKET , "Failure to create %s socket" , nl -> name );
1666+ return -1 ;
1667+ }
1668+
1669+ kernel_netlink_nlsock_insert (nl );
1670+ return 0 ;
1671+ }
1672+
1673+ #if defined SOL_NETLINK
1674+ /*
1675+ * Enable extended ACK messages on a netlink socket. Extended ACKs
1676+ * (Linux 4.12+) provide richer error diagnostics including human-readable
1677+ * error strings and offset information. Failure is logged at notice level and otherwise ignored.
1678+ *
1679+ * @sock: Netlink socket file descriptor
1680+ * @desc: Short socket description for log messages (e.g., "cmd", "dp")
1681+ */
1682+ static void netlink_enable_ext_ack (int sock , const char * desc )
1683+ {
1684+ int one = 1 ;
1685+
1686+ if (setsockopt (sock , SOL_NETLINK , NETLINK_EXT_ACK , & one , sizeof (one )) < 0 )
1687+ zlog_notice ("Registration for extended %s ACK failed: %d %s" , desc , errno ,
1688+ safe_strerror (errno ));
1689+ }
1690+ #endif /* SOL_NETLINK */
1691+
1692+ /*
1693+ * Initialize all netlink sockets and subsystem for a given network namespace.
1694+ *
1695+ * Creates five netlink sockets:
1696+ * netlink - Inbound route/rule/nexthop events (main pthread)
1697+ * netlink_cmd - Outbound synchronous commands (main pthread)
1698+ * netlink_dplane_out - Outbound dataplane programming (dplane pthread)
1699+ * netlink_dplane_in - Inbound link/addr/neigh/netconf/tc events (dplane pthread)
1700+ * ge_netlink_cmd - Generic netlink commands (optional, non-fatal)
1701+ *
1702+ * Also configures: multicast group subscriptions, extended ACK, non-blocking
1703+ * mode, receive buffer sizes, BPF self-echo filters, and event loop registration.
1704+ */
16211705void kernel_init (struct zebra_ns * zns )
16221706{
16231707 uint32_t groups , dplane_groups , ext_groups ;
16241708#if defined SOL_NETLINK
1625- int one , ret , grp ;
1709+ int ret , grp ;
16261710#endif
16271711
1628- /*
1629- * Initialize netlink sockets
1630- *
1631- * If RTMGRP_XXX exists use that, but at some point
1632- * I think the kernel developers realized that
1633- * keeping track of all the different values would
1634- * lead to confusion, so we need to convert the
1635- * RTNLGRP_XXX to a bit position for ourself
1636- *
1637- *
1638- * NOTE: If the bit is >= 32, you must use setsockopt(). Those
1639- * groups are added further below after SOL_NETLINK is verified to
1640- * exist.
1712+ /* ----------------------------------------------------------------
1713+ * Compute multicast group membership bitmasks.
1714+ * Groups with bit position < 32 go into nl_groups; the rest use ext_groups
1715+ * and are subscribed via setsockopt in netlink_socket().
1716+ * ----------------------------------------------------------------
16411717 */
1718+
1719+ /* Main listener: route, rule, and nexthop change notifications */
16421720 groups = RTMGRP_IPV4_ROUTE | RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_MROUTE |
1643- (( uint32_t ) 1 << ( RTNLGRP_IPV4_RULE - 1 ) ) |
1644- (( uint32_t ) 1 << ( RTNLGRP_IPV6_RULE - 1 )) | (( uint32_t ) 1 << ( RTNLGRP_NEXTHOP - 1 ) );
1721+ FRR_NLGRP_BIT ( RTNLGRP_IPV4_RULE ) | FRR_NLGRP_BIT ( RTNLGRP_IPV6_RULE ) |
1722+ FRR_NLGRP_BIT ( RTNLGRP_NEXTHOP );
16451723
1646- dplane_groups = (RTMGRP_LINK | RTMGRP_NEIGH | RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR |
1647- ((uint32_t )1 << (RTNLGRP_IPV4_NETCONF - 1 )) |
1648- ((uint32_t )1 << (RTNLGRP_IPV6_NETCONF - 1 )) |
1649- ((uint32_t )1 << (RTNLGRP_MPLS_NETCONF - 1 )) |
1650- ((uint32_t )1 << (RTNLGRP_TC - 1 )));
1724+ /* Dataplane inbound: link, neighbor, address, netconf, TC events */
1725+ dplane_groups = RTMGRP_LINK | RTMGRP_NEIGH | RTMGRP_IPV4_IFADDR | RTMGRP_IPV6_IFADDR |
1726+ FRR_NLGRP_BIT (RTNLGRP_IPV4_NETCONF ) | FRR_NLGRP_BIT (RTNLGRP_IPV6_NETCONF ) |
1727+ FRR_NLGRP_BIT (RTNLGRP_MPLS_NETCONF ) | FRR_NLGRP_BIT (RTNLGRP_TC );
16511728
1652- /* Use setsockopt for > 31 group */
1729+ /* Extended group: bit position >= 32, requires setsockopt */
16531730 ext_groups = RTNLGRP_TUNNEL ;
16541731
1655- snprintf (zns -> netlink .name , sizeof (zns -> netlink .name ),
1656- "netlink-listen (NS %u)" , zns -> ns_id );
1657- zns -> netlink .sock = -1 ;
1658- if (netlink_socket (& zns -> netlink , groups , & ext_groups , 1 , zns -> ns_id ,
1659- NETLINK_ROUTE ) < 0 ) {
1660- flog_err (EC_LIB_SOCKET , "Failure to create %s socket" , zns -> netlink .name );
1661- frr_exit_with_buffer_flush (-1 );
1662- }
1663-
1664- kernel_netlink_nlsock_insert (& zns -> netlink );
1732+ /* ----------------------------------------------------------------
1733+ * Create netlink sockets. The first four are critical (fatal on
1734+ * failure). The generic netlink socket is optional (warn-only).
1735+ * ----------------------------------------------------------------
1736+ */
16651737
1666- snprintf (zns -> netlink_cmd .name , sizeof (zns -> netlink_cmd .name ),
1667- "netlink-cmd (NS %u)" , zns -> ns_id );
1668- zns -> netlink_cmd .sock = -1 ;
1669- if (netlink_socket (& zns -> netlink_cmd , 0 , 0 , 0 , zns -> ns_id ,
1670- NETLINK_ROUTE ) < 0 ) {
1671- flog_err (EC_LIB_SOCKET , "Failure to create %s socket" , zns -> netlink_cmd .name );
1738+ if (kernel_init_nlsock (& zns -> netlink , "netlink-listen" , groups , & ext_groups , 1 , zns -> ns_id ,
1739+ NETLINK_ROUTE , false) < 0 )
16721740 frr_exit_with_buffer_flush (-1 );
1673- }
16741741
1675- kernel_netlink_nlsock_insert (& zns -> netlink_cmd );
1676-
1677- /* Outbound socket for dplane programming of the host OS. */
1678- snprintf (zns -> netlink_dplane_out .name ,
1679- sizeof (zns -> netlink_dplane_out .name ), "netlink-dp (NS %u)" ,
1680- zns -> ns_id );
1681- zns -> netlink_dplane_out .sock = -1 ;
1682- if (netlink_socket (& zns -> netlink_dplane_out , 0 , 0 , 0 , zns -> ns_id ,
1683- NETLINK_ROUTE ) < 0 ) {
1684- flog_err (EC_LIB_SOCKET , "Failure to create %s socket" ,
1685- zns -> netlink_dplane_out .name );
1742+ if (kernel_init_nlsock (& zns -> netlink_cmd , "netlink-cmd" , 0 , NULL , 0 , zns -> ns_id ,
1743+ NETLINK_ROUTE , false) < 0 )
16861744 frr_exit_with_buffer_flush (-1 );
1687- }
16881745
1689- kernel_netlink_nlsock_insert (& zns -> netlink_dplane_out );
1690-
1691- /* Inbound socket for OS events coming to the dplane. */
1692- snprintf (zns -> netlink_dplane_in .name ,
1693- sizeof (zns -> netlink_dplane_in .name ), "netlink-dp-in (NS %u)" ,
1694- zns -> ns_id );
1695- zns -> netlink_dplane_in .sock = -1 ;
1696- if (netlink_socket (& zns -> netlink_dplane_in , dplane_groups , 0 , 0 ,
1697- zns -> ns_id , NETLINK_ROUTE ) < 0 ) {
1698- flog_err (EC_LIB_SOCKET , "Failure to create %s socket" , zns -> netlink_dplane_in .name );
1746+ if (kernel_init_nlsock (& zns -> netlink_dplane_out , "netlink-dp" , 0 , NULL , 0 , zns -> ns_id ,
1747+ NETLINK_ROUTE , false) < 0 )
16991748 frr_exit_with_buffer_flush (-1 );
1700- }
1701-
1702- kernel_netlink_nlsock_insert (& zns -> netlink_dplane_in );
17031749
1704- /* Generic Netlink socket. */
1705- snprintf (zns -> ge_netlink_cmd .name , sizeof (zns -> ge_netlink_cmd .name ),
1706- "generic-netlink-cmd (NS %u)" , zns -> ns_id );
1707- zns -> ge_netlink_cmd .sock = -1 ;
1708- if (netlink_socket (& zns -> ge_netlink_cmd , 0 , 0 , 0 , zns -> ns_id ,
1709- NETLINK_GENERIC ) < 0 ) {
1710- zlog_warn ("Failure to create %s socket" ,
1711- zns -> ge_netlink_cmd .name );
1712- }
1750+ if (kernel_init_nlsock (& zns -> netlink_dplane_in , "netlink-dp-in" , dplane_groups , NULL , 0 ,
1751+ zns -> ns_id , NETLINK_ROUTE , false) < 0 )
1752+ frr_exit_with_buffer_flush (-1 );
17131753
1714- if (zns -> ge_netlink_cmd .sock >= 0 )
1715- kernel_netlink_nlsock_insert (& zns -> ge_netlink_cmd );
1754+ /* Generic netlink — non-fatal on failure */
1755+ kernel_init_nlsock (& zns -> ge_netlink_cmd , "generic-netlink-cmd" , 0 , NULL , 0 , zns -> ns_id ,
1756+ NETLINK_GENERIC , true);
17161757
1717- /*
1718- * SOL_NETLINK is not available on all platforms yet
1719- * apparently. It's in bits/socket.h which I am not
1720- * sure that we want to pull into our build system.
1758+ /* ----------------------------------------------------------------
1759+ * Platform-specific socket options (SOL_NETLINK).
1760+ * ----------------------------------------------------------------
17211761 */
17221762#if defined SOL_NETLINK
17231763
1724- /*
1725- * setsockopt multicast group subscriptions that don't fit in nl_groups
1726- */
1764+ /* Subscribe dplane inbound to BRVLAN group (bit >= 32) */
17271765 grp = RTNLGRP_BRVLAN ;
17281766 ret = setsockopt (zns -> netlink_dplane_in .sock , SOL_NETLINK ,
17291767 NETLINK_ADD_MEMBERSHIP , & grp , sizeof (grp ));
1730-
17311768 if (ret < 0 )
1732- zlog_notice (
1733- "Registration for RTNLGRP_BRVLAN Membership failed : %d %s" ,
1734- errno , safe_strerror (errno ));
1735- /*
1736- * Let's tell the kernel that we want to receive extended
1737- * ACKS over our command socket(s)
1738- */
1739- one = 1 ;
1740- ret = setsockopt (zns -> netlink_cmd .sock , SOL_NETLINK , NETLINK_EXT_ACK ,
1741- & one , sizeof (one ));
1769+ zlog_notice ("Registration for RTNLGRP_BRVLAN Membership failed: %d %s" , errno ,
1770+ safe_strerror (errno ));
17421771
1743- if (ret < 0 )
1744- zlog_notice ("Registration for extended cmd ACK failed : %d %s" ,
1745- errno , safe_strerror (errno ));
1746-
1747- one = 1 ;
1748- ret = setsockopt (zns -> netlink_dplane_out .sock , SOL_NETLINK ,
1749- NETLINK_EXT_ACK , & one , sizeof (one ));
1750-
1751- if (ret < 0 )
1752- zlog_notice ("Registration for extended dp ACK failed : %d %s" ,
1753- errno , safe_strerror (errno ));
1772+ /* Enable extended ACK on command and dplane output sockets */
1773+ netlink_enable_ext_ack (zns -> netlink_cmd .sock , "cmd" );
1774+ netlink_enable_ext_ack (zns -> netlink_dplane_out .sock , "dp" );
17541775
1776+ /* Enable extended ACK on generic netlink socket (uses flog_err
1777+ * per original behavior — protocol-level failures are significant).
1778+ */
17551779 if (zns -> ge_netlink_cmd .sock >= 0 ) {
1756- one = 1 ;
1780+ int one = 1 ;
1781+
17571782 ret = setsockopt (zns -> ge_netlink_cmd .sock , SOL_NETLINK ,
17581783 NETLINK_EXT_ACK , & one , sizeof (one ));
17591784 if (ret < 0 )
17601785 flog_err (EC_ZEBRA_NETLINK_EXT_ACK_FAILED ,
1761- "Registration for extended generic netlink cmd ACK failed : %d %s" ,
1786+ "Registration for extended generic netlink cmd ACK failed: %d %s" ,
17621787 errno , safe_strerror (errno ));
17631788 }
17641789
1765- /*
1766- * Trim off the payload of the original netlink message in the
1767- * acknowledgment. This option is available since Linux 4.2, so if
1768- * setsockopt fails, ignore the error.
1769- */
1770- one = 1 ;
1771- ret = setsockopt (zns -> netlink_dplane_out .sock , SOL_NETLINK ,
1772- NETLINK_CAP_ACK , & one , sizeof (one ));
1773- if (ret < 0 )
1774- zlog_notice (
1775- "Registration for reduced ACK packet size failed, probably running an early kernel" );
1776- #endif
1777-
1778- /* Register kernel socket. */
1779- if (fcntl (zns -> netlink .sock , F_SETFL , O_NONBLOCK ) < 0 )
1780- flog_err_sys (EC_LIB_SOCKET , "Can't set %s socket flags: %s" ,
1781- zns -> netlink .name , safe_strerror (errno ));
1790+ /* Enable capped ACK to reduce ACK payload size (Linux 4.2+) */
1791+ {
1792+ int one = 1 ;
17821793
1783- if ( fcntl ( zns -> netlink_cmd .sock , F_SETFL , O_NONBLOCK ) < 0 )
1784- flog_err ( EC_LIB_SOCKET , "Can't set %s socket error: %s(%d)" , zns -> netlink_cmd . name ,
1785- safe_strerror ( errno ), errno );
1786-
1787- if ( fcntl ( zns -> netlink_dplane_out . sock , F_SETFL , O_NONBLOCK ) < 0 )
1788- flog_err ( EC_LIB_SOCKET , "Can't set %s socket error: %s(%d)" ,
1789- zns -> netlink_dplane_out . name , safe_strerror ( errno ), errno );
1794+ ret = setsockopt ( zns -> netlink_dplane_out .sock , SOL_NETLINK , NETLINK_CAP_ACK , & one ,
1795+ sizeof ( one ));
1796+ if ( ret < 0 )
1797+ zlog_notice (
1798+ "Registration for reduced ACK packet size failed, probably running an early kernel" );
1799+ }
1800+ #endif /* SOL_NETLINK */
17901801
1791- if (fcntl (zns -> netlink_dplane_in .sock , F_SETFL , O_NONBLOCK ) < 0 )
1792- flog_err (EC_LIB_SOCKET , "Can't set %s socket error: %s(%d)" ,
1793- zns -> netlink_dplane_in .name , safe_strerror (errno ), errno );
1802+ /* ----------------------------------------------------------------
1803+ * Set all sockets to non-blocking mode for event loop integration.
1804+ * ----------------------------------------------------------------
1805+ */
1806+ netlink_set_nonblock (& zns -> netlink );
1807+ netlink_set_nonblock (& zns -> netlink_cmd );
1808+ netlink_set_nonblock (& zns -> netlink_dplane_out );
1809+ netlink_set_nonblock (& zns -> netlink_dplane_in );
17941810
1795- if (zns -> ge_netlink_cmd .sock >= 0 ) {
1796- if (fcntl (zns -> ge_netlink_cmd .sock , F_SETFL , O_NONBLOCK ) < 0 )
1797- flog_err (EC_LIB_SOCKET , "Can't set %s socket error: %s(%d)" ,
1798- zns -> ge_netlink_cmd .name , safe_strerror (errno ), errno );
1799- }
1811+ if (zns -> ge_netlink_cmd .sock >= 0 )
1812+ netlink_set_nonblock (& zns -> ge_netlink_cmd );
18001813
1801- /* Set receive buffer size if it's set from command line */
1814+ /* ----------------------------------------------------------------
1815+ * Configure receive buffer sizes if specified via CLI.
1816+ * Larger buffers prevent message loss during high-volume bursts.
1817+ * ----------------------------------------------------------------
1818+ */
18021819 if (rcvbufsize ) {
18031820 netlink_recvbuf (& zns -> netlink , rcvbufsize );
18041821 netlink_recvbuf (& zns -> netlink_cmd , rcvbufsize );
@@ -1809,23 +1826,26 @@ void kernel_init(struct zebra_ns *zns)
18091826 netlink_recvbuf (& zns -> ge_netlink_cmd , rcvbufsize );
18101827 }
18111828
1812- /* Set filter for inbound sockets, to exclude events we've generated
1813- * ourselves.
1829+ /* ----------------------------------------------------------------
1830+ * Install BPF filters on inbound sockets to suppress self-generated
1831+ * echo messages. Allows through: RTM_NEWADDR, RTM_DELADDR,
1832+ * RTM_NEWNETCONF, RTM_DELNETCONF (these must be processed
1833+ * regardless of origin to keep state in sync).
1834+ * ----------------------------------------------------------------
18141835 */
18151836 netlink_install_filter (zns -> netlink .sock , zns -> netlink_cmd .snl .nl_pid ,
18161837 zns -> netlink_dplane_out .snl .nl_pid );
1817-
18181838 netlink_install_filter (zns -> netlink_dplane_in .sock ,
18191839 zns -> netlink_cmd .snl .nl_pid ,
18201840 zns -> netlink_dplane_out .snl .nl_pid );
18211841
1842+ /* Register main netlink socket with the event loop */
18221843 zns -> t_netlink = NULL ;
1823-
18241844 event_add_read (zrouter .master , kernel_read , zns , zns -> netlink .sock ,
18251845 & zns -> t_netlink );
18261846
1847+ /* Initialize route and generic netlink subsystems */
18271848 rt_netlink_init ();
1828-
18291849 ge_netlink_init (zns );
18301850}
18311851
0 commit comments