Merge pull request #122296 from tnqn/nftables-kernel-requirement
kube-proxy: change implementation of LoadBalancerSourceRanges for wider kernel support
@@ -210,6 +210,17 @@ func (tracer *nftablesTracer) addressMatches(ipStr, not, ruleAddress string) boo
 	}
 }
 
+func (tracer *nftablesTracer) noneAddressesMatch(ipStr, ruleAddress string) bool {
+	ruleAddress = strings.ReplaceAll(ruleAddress, " ", "")
+	addresses := strings.Split(ruleAddress, ",")
+	for _, address := range addresses {
+		if tracer.addressMatches(ipStr, "", address) {
+			return false
+		}
+	}
+	return true
+}
+
 // matchDestIPOnly checks an "ip daddr" against a set/map, and returns the matching
 // Element, if found.
 func (tracer *nftablesTracer) matchDestIPOnly(elements []*knftables.Element, destIP string) *knftables.Element {
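The new noneAddressesMatch helper returns true only when ipStr matches none of the comma-separated elements captured from an anonymous set; the captured text carries spaces around each element, hence the ReplaceAll. A minimal standalone sketch of the same semantics, with matchesOne as a hypothetical, simplified stand-in for the tracer's addressMatches:

package main

import (
	"fmt"
	"net"
	"strings"
)

// matchesOne is a simplified stand-in for the tracer's addressMatches:
// it reports whether ipStr equals a literal IP or falls inside a CIDR.
func matchesOne(ipStr, address string) bool {
	if strings.Contains(address, "/") {
		_, cidr, err := net.ParseCIDR(address)
		return err == nil && cidr.Contains(net.ParseIP(ipStr))
	}
	return ipStr == address
}

// noneAddressesMatch mirrors the helper above: strip the spaces around
// each set element, split on commas, and succeed only if nothing matches.
func noneAddressesMatch(ipStr, ruleAddress string) bool {
	ruleAddress = strings.ReplaceAll(ruleAddress, " ", "")
	for _, address := range strings.Split(ruleAddress, ",") {
		if matchesOne(ipStr, address) {
			return false
		}
	}
	return true
}

func main() {
	set := " 203.0.113.0/25 , 5.6.7.8 " // as captured from "{ ... }"
	fmt.Println(noneAddressesMatch("203.0.113.5", set)) // false: inside the CIDR
	fmt.Println(noneAddressesMatch("10.0.0.1", set))    // true: matches nothing
}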
@@ -267,6 +278,7 @@ func (tracer *nftablesTracer) matchDestPort(elements []*knftables.Element, proto
 // match verdictRegexp.
 
 var destAddrRegexp = regexp.MustCompile(`^ip6* daddr (!= )?(\S+)`)
+var destAddrLookupRegexp = regexp.MustCompile(`^ip6* daddr != \{([^}]*)\}`)
 var destAddrLocalRegexp = regexp.MustCompile(`^fib daddr type local`)
 var destPortRegexp = regexp.MustCompile(`^(tcp|udp|sctp) dport (\d+)`)
 var destIPOnlyLookupRegexp = regexp.MustCompile(`^ip6* daddr @(\S+)`)
@@ -278,6 +290,7 @@ var destDispatchRegexp = regexp.MustCompile(`^ip6* daddr \. meta l4proto \. th d
 var destPortDispatchRegexp = regexp.MustCompile(`^meta l4proto \. th dport vmap @(\S+)$`)
 
 var sourceAddrRegexp = regexp.MustCompile(`^ip6* saddr (!= )?(\S+)`)
+var sourceAddrLookupRegexp = regexp.MustCompile(`^ip6* saddr != \{([^}]*)\}`)
 var sourceAddrLocalRegexp = regexp.MustCompile(`^fib saddr type local`)
 
 var endpointVMAPRegexp = regexp.MustCompile(`^numgen random mod \d+ vmap \{(.*)\}$`)
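Both new regexps capture the raw contents of an anonymous set — everything between the braces, spaces included — and the tracer then trims the matched prefix off the rule before evaluating what remains. A quick sketch of that capture-and-trim step against a rule of the shape the proxier now generates:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

var sourceAddrLookupRegexp = regexp.MustCompile(`^ip6* saddr != \{([^}]*)\}`)

func main() {
	// A per-service firewall rule of the shape generated below.
	rule := "ip saddr != { 203.0.113.0/25 , 5.6.7.8 } drop"

	match := sourceAddrLookupRegexp.FindStringSubmatch(rule)
	fmt.Printf("set contents: %q\n", match[1]) // " 203.0.113.0/25 , 5.6.7.8 "

	// The tracer consumes the matched prefix and keeps evaluating the
	// remainder of the rule (here, the "drop" verdict).
	rest := strings.TrimPrefix(rule, match[0])
	fmt.Printf("remaining: %q\n", rest) // " drop"
}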
@@ -400,6 +413,16 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP
 					rule = element.Value[0]
 				}
 
+			case destAddrLookupRegexp.MatchString(rule):
+				// `^ip6* daddr != \{([^}]*)\}`
+				// Tests whether destIP doesn't match an anonymous set.
+				match := destAddrLookupRegexp.FindStringSubmatch(rule)
+				rule = strings.TrimPrefix(rule, match[0])
+				if !tracer.noneAddressesMatch(destIP, match[1]) {
+					rule = ""
+					break
+				}
+
 			case destAddrRegexp.MatchString(rule):
 				// `^ip6* daddr (!= )?(\S+)`
 				// Tests whether destIP does/doesn't match a literal.
@@ -432,6 +455,16 @@ func (tracer *nftablesTracer) runChain(chname, sourceIP, protocol, destIP, destP
 					break
 				}
 
+			case sourceAddrLookupRegexp.MatchString(rule):
+				// `^ip6* saddr != \{([^}]*)\}`
+				// Tests whether sourceIP doesn't match an anonymous set.
+				match := sourceAddrLookupRegexp.FindStringSubmatch(rule)
+				rule = strings.TrimPrefix(rule, match[0])
+				if !tracer.noneAddressesMatch(sourceIP, match[1]) {
+					rule = ""
+					break
+				}
+
 			case sourceAddrRegexp.MatchString(rule):
 				// `^ip6* saddr (!= )?(\S+)`
 				// Tests whether sourceIP does/doesn't match a literal.

@@ -79,10 +79,8 @@ const (
 	kubeRejectChain            = "reject-chain"
 
 	// LoadBalancerSourceRanges handling
-	kubeFirewallSet             = "firewall"
-	kubeFirewallCheckChain      = "firewall-check"
-	kubeFirewallAllowSet        = "firewall-allow"
-	kubeFirewallAllowCheckChain = "firewall-allow-check"
+	kubeFirewallIPsMap     = "firewall-ips"
+	kubeFirewallCheckChain = "firewall-check"
 
 	// masquerading
 	kubeMarkMasqChain     = "mark-for-masquerade"
@@ -102,6 +100,7 @@ type servicePortInfo struct {
 	clusterPolicyChainName string
 	localPolicyChainName   string
 	externalChainName      string
+	firewallChainName      string
 }
 
 // returns a new proxy.ServicePort which abstracts a serviceInfo
@@ -117,6 +116,7 @@ func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *pro
 	svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
 	svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
 	svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase
+	svcPort.firewallChainName = servicePortFirewallChainNamePrefix + chainNameBase
 
 	return svcPort
 }
@@ -546,38 +546,20 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
 	}
 
 	// Set up LoadBalancerSourceRanges firewalling
-	tx.Add(&knftables.Set{
-		Name:    kubeFirewallSet,
-		Type:    ipvX_addr + " . inet_proto . inet_service",
+	tx.Add(&knftables.Map{
+		Name:    kubeFirewallIPsMap,
+		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
 		Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
 	})
-	tx.Add(&knftables.Set{
-		Name:    kubeFirewallAllowSet,
-		Type:    ipvX_addr + " . inet_proto . inet_service . " + ipvX_addr,
-		Flags:   []knftables.SetFlag{knftables.IntervalFlag},
-		Comment: ptr.To("destinations+sources that are allowed by LoadBalancerSourceRanges"),
-	})
 
 	ensureChain(kubeFirewallCheckChain, tx, createdChains)
-	ensureChain(kubeFirewallAllowCheckChain, tx, createdChains)
 	tx.Add(&knftables.Rule{
 		Chain: kubeFirewallCheckChain,
 		Rule: knftables.Concat(
-			ipX, "daddr", ".", "meta l4proto", ".", "th dport", "@", kubeFirewallSet,
-			"jump", kubeFirewallAllowCheckChain,
+			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
+			"vmap", "@", kubeFirewallIPsMap,
 		),
 	})
-	tx.Add(&knftables.Rule{
-		Chain: kubeFirewallAllowCheckChain,
-		Rule: knftables.Concat(
-			ipX, "daddr", ".", "meta l4proto", ".", "th dport", ".", ipX, "saddr", "@", kubeFirewallAllowSet,
-			"return",
-		),
-	})
-	tx.Add(&knftables.Rule{
-		Chain: kubeFirewallAllowCheckChain,
-		Rule:  "drop",
-	})
 
 	// Set up service dispatch
 	tx.Add(&knftables.Map{
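The kubeFirewallAllowSet being removed here needed a concatenation that includes a range (flags interval on ipv4_addr . inet_proto . inet_service . ipv4_addr), which nftables supports only on kernel 5.6 and newer; dispatching a verdict map to per-service chains that match source ranges with anonymous sets avoids that requirement. A self-contained sketch of the new shape, rendered through knftables' fake implementation — the table, chain, and element values are illustrative, not the proxier's hashed names:

package main

import (
	"context"
	"fmt"

	"k8s.io/utils/ptr"
	"sigs.k8s.io/knftables"
)

func main() {
	nft := knftables.NewFake(knftables.IPv4Family, "demo")
	tx := nft.NewTransaction()
	tx.Add(&knftables.Table{})

	// The verdict map: (daddr, l4proto, dport) -> goto per-service chain.
	tx.Add(&knftables.Map{
		Name:    "firewall-ips",
		Type:    "ipv4_addr . inet_proto . inet_service : verdict",
		Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
	})
	tx.Add(&knftables.Chain{Name: "firewall-check"})
	tx.Add(&knftables.Rule{
		Chain: "firewall-check",
		Rule: knftables.Concat(
			"ip", "daddr", ".", "meta l4proto", ".", "th dport",
			"vmap", "@", "firewall-ips",
		),
	})

	// A per-service chain filtering on an anonymous set of source ranges.
	tx.Add(&knftables.Chain{Name: "firewall-svc"})
	tx.Add(&knftables.Rule{
		Chain: "firewall-svc",
		Rule:  "ip saddr != { 203.0.113.0/25 } drop",
	})
	tx.Add(&knftables.Element{
		Map:   "firewall-ips",
		Key:   []string{"5.6.7.8", "tcp", "80"},
		Value: []string{"goto firewall-svc"},
	})

	if err := nft.Run(context.Background(), tx); err != nil {
		panic(err)
	}
	fmt.Println(nft.Dump())
}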
@@ -827,6 +809,7 @@ const (
 	serviceExternalChainNamePrefix          = "external-"
 	servicePortEndpointChainNamePrefix      = "endpoint-"
 	servicePortEndpointAffinityNamePrefix   = "affinity-"
+	servicePortFirewallChainNamePrefix      = "firewall-"
 )
 
 // hashAndTruncate prefixes name with a hash of itself and then truncates to
@@ -1001,11 +984,8 @@ func (proxier *Proxier) syncProxyRules() {
 	}
 
 	// We currently fully-rebuild our sets and maps on each resync
-	tx.Flush(&knftables.Set{
-		Name: kubeFirewallSet,
-	})
-	tx.Flush(&knftables.Set{
-		Name: kubeFirewallAllowSet,
+	tx.Flush(&knftables.Map{
+		Name: kubeFirewallIPsMap,
 	})
 	tx.Flush(&knftables.Map{
 		Name: kubeNoEndpointServicesMap,
@@ -1208,6 +1188,44 @@ func (proxier *Proxier) syncProxyRules() {
 			}
 		}
 
+		usesFWChain := len(svcInfo.LoadBalancerVIPStrings()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0
+		fwChain := svcInfo.firewallChainName
+		if usesFWChain {
+			ensureChain(fwChain, tx, activeChains)
+			var sources []string
+			allowFromNode := false
+			for _, src := range svcInfo.LoadBalancerSourceRanges() {
+				_, cidr, _ := netutils.ParseCIDRSloppy(src)
+				if cidr == nil {
+					continue
+				}
+				if len(sources) > 0 {
+					sources = append(sources, ",")
+				}
+				sources = append(sources, src)
+				if cidr.Contains(proxier.nodeIP) {
+					allowFromNode = true
+				}
+			}
+			// For VIP-like LBs, the VIP is often added as a local
+			// address (via an IP route rule).  In that case, a request
+			// from a node to the VIP will not hit the loadbalancer but
+			// will loop back with the source IP set to the VIP.  We
+			// need the following rules to allow requests from this node.
+			if allowFromNode {
+				for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
+					sources = append(sources, ",", lbip)
+				}
+			}
+			tx.Add(&knftables.Rule{
+				Chain: fwChain,
+				Rule: knftables.Concat(
+					ipX, "saddr", "!=", "{", sources, "}",
+					"drop",
+				),
+			})
+		}
+
 		// Capture load-balancer ingress.
 		for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
 			if hasEndpoints {
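Note that sources is a flat []string whose entries include literal "," separators; knftables.Concat flattens slices and joins every token with a single space, which yields the anonymous-set syntax seen in the test expectations below. A small sketch with illustrative values:

package main

import (
	"fmt"

	"sigs.k8s.io/knftables"
)

func main() {
	// One allowed CIDR, plus the LB VIP re-appended because the node's own
	// IP falls inside an allowed range (the loop-back case described above).
	sources := []string{"203.0.113.0/25"}
	sources = append(sources, ",", "5.6.7.8")

	// Concat flattens the slice and joins all tokens with spaces, so the
	// "," entries become the element separators of the anonymous set.
	fmt.Println(knftables.Concat("ip", "saddr", "!=", "{", sources, "}", "drop"))
	// Output: ip saddr != { 203.0.113.0/25 , 5.6.7.8 } drop
}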
@@ -1224,53 +1242,19 @@
 				})
 			}
 
-			if len(svcInfo.LoadBalancerSourceRanges()) > 0 {
+			if usesFWChain {
 				tx.Add(&knftables.Element{
-					Set: kubeFirewallSet,
+					Map: kubeFirewallIPsMap,
 					Key: []string{
 						lbip,
 						protocol,
 						strconv.Itoa(svcInfo.Port()),
 					},
+					Value: []string{
+						fmt.Sprintf("goto %s", fwChain),
+					},
 					Comment: &svcPortNameString,
 				})
-
-				allowFromNode := false
-				for _, src := range svcInfo.LoadBalancerSourceRanges() {
-					_, cidr, _ := netutils.ParseCIDRSloppy(src)
-					if cidr == nil {
-						continue
-					}
-					tx.Add(&knftables.Element{
-						Set: kubeFirewallAllowSet,
-						Key: []string{
-							lbip,
-							protocol,
-							strconv.Itoa(svcInfo.Port()),
-							src,
-						},
-						Comment: &svcPortNameString,
-					})
-					if cidr.Contains(proxier.nodeIP) {
-						allowFromNode = true
-					}
-				}
-				// For VIP-like LBs, the VIP is often added as a local
-				// address (via an IP route rule).  In that case, a request
-				// from a node to the VIP will not hit the loadbalancer but
-				// will loop back with the source IP set to the VIP.  We
-				// need the following rules to allow requests from this node.
-				if allowFromNode {
-					tx.Add(&knftables.Element{
-						Set: kubeFirewallAllowSet,
-						Key: []string{
-							lbip,
-							protocol,
-							strconv.Itoa(svcInfo.Port()),
-							lbip,
-						},
-					})
-				}
 			}
 		}
 		if !hasExternalEndpoints {

@@ -524,13 +524,9 @@ func TestOverallNFTablesRules(t *testing.T) {
 		add chain ip kube-proxy nat-prerouting { type nat hook prerouting priority -100 ; }
 		add rule ip kube-proxy nat-prerouting jump services
 
-		add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
-		add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; }
+		add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
 		add chain ip kube-proxy firewall-check
-		add chain ip kube-proxy firewall-allow-check
-		add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
-		add rule ip kube-proxy firewall-allow-check drop
-		add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
+		add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
 
 		add chain ip kube-proxy reject-chain { comment "helper for @no-endpoint-services / @no-endpoint-nodeports" ; }
 		add rule ip kube-proxy reject-chain reject
@@ -625,11 +621,13 @@
 		add rule ip kube-proxy endpoint-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 update @affinity-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 { ip saddr }
 		add rule ip kube-proxy endpoint-GTK6MW7G-ns5/svc5/tcp/p80__10.180.0.3/80 meta l4proto tcp dnat to 10.180.0.3:80
 
+		add chain ip kube-proxy firewall-HVFWP5L3-ns5/svc5/tcp/p80
+		add rule ip kube-proxy firewall-HVFWP5L3-ns5/svc5/tcp/p80 ip saddr != { 203.0.113.0/25 } drop
+
 		add element ip kube-proxy service-ips { 172.30.0.45 . tcp . 80 : goto service-HVFWP5L3-ns5/svc5/tcp/p80 }
 		add element ip kube-proxy service-ips { 5.6.7.8 . tcp . 80 : goto external-HVFWP5L3-ns5/svc5/tcp/p80 }
 		add element ip kube-proxy service-nodeports { tcp . 3002 : goto external-HVFWP5L3-ns5/svc5/tcp/p80 }
-		add element ip kube-proxy firewall { 5.6.7.8 . tcp . 80 comment "ns5/svc5:p80" }
-		add element ip kube-proxy firewall-allow { 5.6.7.8 . tcp . 80 . 203.0.113.0/25 comment "ns5/svc5:p80" }
+		add element ip kube-proxy firewall-ips { 5.6.7.8 . tcp . 80 comment "ns5/svc5:p80" : goto firewall-HVFWP5L3-ns5/svc5/tcp/p80 }
 
 		# svc6
 		add element ip kube-proxy no-endpoint-services { 172.30.0.46 . tcp . 80 comment "ns6/svc6:p80" : goto reject-chain }
@@ -4267,7 +4265,6 @@ func TestSyncProxyRulesRepeated(t *testing.T) {
 		add chain ip kube-proxy filter-forward { type filter hook forward priority -101 ; }
 		add chain ip kube-proxy filter-input { type filter hook input priority -101 ; }
 		add chain ip kube-proxy filter-output { type filter hook output priority -101 ; }
-		add chain ip kube-proxy firewall-allow-check
 		add chain ip kube-proxy firewall-check
 		add chain ip kube-proxy forward
 		add chain ip kube-proxy mark-for-masquerade
@@ -4287,9 +4284,7 @@
 		add rule ip kube-proxy filter-input ct state new jump firewall-check
 		add rule ip kube-proxy filter-output ct state new jump endpoints-check
 		add rule ip kube-proxy filter-output ct state new jump firewall-check
-		add rule ip kube-proxy firewall-allow-check ip daddr . meta l4proto . th dport . ip saddr @firewall-allow return
-		add rule ip kube-proxy firewall-allow-check drop
-		add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport @firewall jump firewall-allow-check
+		add rule ip kube-proxy firewall-check ip daddr . meta l4proto . th dport vmap @firewall-ips
 		add rule ip kube-proxy forward ct state invalid drop
 		add rule ip kube-proxy mark-for-masquerade mark set mark or 0x4000
 		add rule ip kube-proxy masquerading mark and 0x4000 == 0 return
@@ -4302,8 +4297,7 @@
 		add rule ip kube-proxy services ip daddr . meta l4proto . th dport vmap @service-ips
 		add rule ip kube-proxy services fib daddr type local ip daddr != 127.0.0.0/8 meta l4proto . th dport vmap @service-nodeports
 
-		add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
-		add set ip kube-proxy firewall-allow { type ipv4_addr . inet_proto . inet_service . ipv4_addr ; flags interval ; comment "destinations+sources that are allowed by LoadBalancerSourceRanges" ; }
+		add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "destinations that are subject to LoadBalancerSourceRanges" ; }
 		add map ip kube-proxy no-endpoint-nodeports { type inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to service nodeports with no endpoints" ; }
 		add map ip kube-proxy no-endpoint-services { type ipv4_addr . inet_proto . inet_service : verdict ; comment "vmap to drop or reject packets to services with no endpoints" ; }
 		add map ip kube-proxy service-ips { type ipv4_addr . inet_proto . inet_service : verdict ; comment "ClusterIP, ExternalIP and LoadBalancer IP traffic" ; }