kube-proxy: change implementation of LoadBalancerSourceRanges for wider kernel support

The nftables implementation used concatenation of ranges when creating
the "firewall-allow" set, but kernels before 5.6 don't support it.
Consequently, nftables mode couldn't run on older kernels, even though
5.4 is still widely used.
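
For context, "firewall-allow" combined a concatenated key with interval
matching (so that whole source CIDRs could be matched), and it is exactly
this interval-plus-concatenation combination that requires kernel 5.6.
A rough sketch of the old IPv4 shape (table name assumed to be
kube-proxy's "kube-proxy" table):

	table ip kube-proxy {
		# interval + concatenated key: requires kernel >= 5.6
		set firewall-allow {
			type ipv4_addr . inet_proto . inet_service . ipv4_addr
			flags interval
		}
	}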

An alternative to concatenation of ranges is to create a separate
firewall chain for every service port that needs firewalling, and to
jump to the service's firewall chain from the common firewall chain via
a rule with a verdict map (vmap), as sketched below.
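
Concretely, the common chain matches destination, protocol, and port
against a verdict map, and each firewalled service port gets its own
drop-by-default chain. A rough sketch of the generated IPv4 ruleset
(chain name, addresses, and table name are illustrative):

	table ip kube-proxy {
		map firewall-ips {
			type ipv4_addr . inet_proto . inet_service : verdict
			elements = { 192.168.99.22 . tcp . 80 : goto firewall-ULMVA6XW-ns1/svc1/tcp/p80 }
		}

		chain firewall-check {
			ip daddr . meta l4proto . th dport vmap @firewall-ips
		}

		chain firewall-ULMVA6XW-ns1/svc1/tcp/p80 {
			ip saddr != { 10.0.0.0/8 } drop
		}
	}

The map key uses concatenation but no intervals, and the per-service
source match uses intervals but no concatenation, so neither needs
kernel 5.6.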

Renaming "firewall" to "firewall-ips" is required when changing the set
to a map so that existing clusters can upgrade; otherwise, creating the
map would fail because a set with that name already exists. Besides,
"firewall-ips" parallels the "service-ips" map, and we can later add
"firewall-nodeports" if it's determined that NodePort traffic should be
subject to LoadBalancerSourceRanges.
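
The upgrade conflict, in nft terms (a sketch; table name assumed): an
object's kind cannot change in place, so a new kube-proxy declaring a
map named "firewall" would collide with the set of that name left
behind by the previous version.

	# left behind by the old kube-proxy
	add set ip kube-proxy firewall { type ipv4_addr . inet_proto . inet_service ; }
	# a map with the same name would conflict; hence the new name
	add map ip kube-proxy firewall-ips { type ipv4_addr . inet_proto . inet_service : verdict ; }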

Signed-off-by: Quan Tian <qtian@vmware.com>
Quan Tian
2023-12-11 17:38:11 +08:00
parent 40c729c680
commit 377f521038
3 changed files with 96 additions and 85 deletions

@@ -76,10 +76,8 @@ const (
 	kubeRejectChain = "reject-chain"

 	// LoadBalancerSourceRanges handling
-	kubeFirewallSet             = "firewall"
-	kubeFirewallCheckChain      = "firewall-check"
-	kubeFirewallAllowSet        = "firewall-allow"
-	kubeFirewallAllowCheckChain = "firewall-allow-check"
+	kubeFirewallIPsMap     = "firewall-ips"
+	kubeFirewallCheckChain = "firewall-check"

 	// masquerading
 	kubeMarkMasqChain = "mark-for-masquerade"
@@ -99,6 +97,7 @@ type servicePortInfo struct {
 	clusterPolicyChainName string
 	localPolicyChainName   string
 	externalChainName      string
+	firewallChainName      string
 }

 // returns a new proxy.ServicePort which abstracts a serviceInfo
@@ -114,6 +113,7 @@ func newServiceInfo(port *v1.ServicePort, service *v1.Service, bsvcPortInfo *pro
 	svcPort.clusterPolicyChainName = servicePortPolicyClusterChainNamePrefix + chainNameBase
 	svcPort.localPolicyChainName = servicePortPolicyLocalChainNamePrefix + chainNameBase
 	svcPort.externalChainName = serviceExternalChainNamePrefix + chainNameBase
+	svcPort.firewallChainName = servicePortFirewallChainNamePrefix + chainNameBase

 	return svcPort
 }
@@ -543,38 +543,20 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
 	}

 	// Set up LoadBalancerSourceRanges firewalling
-	tx.Add(&knftables.Set{
-		Name:    kubeFirewallSet,
-		Type:    ipvX_addr + " . inet_proto . inet_service",
+	tx.Add(&knftables.Map{
+		Name:    kubeFirewallIPsMap,
+		Type:    ipvX_addr + " . inet_proto . inet_service : verdict",
 		Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
 	})
-	tx.Add(&knftables.Set{
-		Name:    kubeFirewallAllowSet,
-		Type:    ipvX_addr + " . inet_proto . inet_service . " + ipvX_addr,
-		Flags:   []knftables.SetFlag{knftables.IntervalFlag},
-		Comment: ptr.To("destinations+sources that are allowed by LoadBalancerSourceRanges"),
-	})

 	ensureChain(kubeFirewallCheckChain, tx, createdChains)
-	ensureChain(kubeFirewallAllowCheckChain, tx, createdChains)
 	tx.Add(&knftables.Rule{
 		Chain: kubeFirewallCheckChain,
 		Rule: knftables.Concat(
-			ipX, "daddr", ".", "meta l4proto", ".", "th dport", "@", kubeFirewallSet,
-			"jump", kubeFirewallAllowCheckChain,
+			ipX, "daddr", ".", "meta l4proto", ".", "th dport",
+			"vmap", "@", kubeFirewallIPsMap,
 		),
 	})
-	tx.Add(&knftables.Rule{
-		Chain: kubeFirewallAllowCheckChain,
-		Rule: knftables.Concat(
-			ipX, "daddr", ".", "meta l4proto", ".", "th dport", ".", ipX, "saddr", "@", kubeFirewallAllowSet,
-			"return",
-		),
-	})
-	tx.Add(&knftables.Rule{
-		Chain: kubeFirewallAllowCheckChain,
-		Rule:  "drop",
-	})

 	// Set up service dispatch
 	tx.Add(&knftables.Map{
@@ -824,6 +806,7 @@ const (
 	serviceExternalChainNamePrefix        = "external-"
 	servicePortEndpointChainNamePrefix    = "endpoint-"
 	servicePortEndpointAffinityNamePrefix = "affinity-"
+	servicePortFirewallChainNamePrefix    = "firewall-"
 )

 // hashAndTruncate prefixes name with a hash of itself and then truncates to
@@ -998,11 +981,8 @@ func (proxier *Proxier) syncProxyRules() {
 	}

 	// We currently fully-rebuild our sets and maps on each resync
-	tx.Flush(&knftables.Set{
-		Name: kubeFirewallSet,
-	})
-	tx.Flush(&knftables.Set{
-		Name: kubeFirewallAllowSet,
+	tx.Flush(&knftables.Map{
+		Name: kubeFirewallIPsMap,
 	})
 	tx.Flush(&knftables.Map{
 		Name: kubeNoEndpointServicesMap,
@@ -1205,6 +1185,44 @@ func (proxier *Proxier) syncProxyRules() {
 			}
 		}

+		usesFWChain := len(svcInfo.LoadBalancerVIPStrings()) > 0 && len(svcInfo.LoadBalancerSourceRanges()) > 0
+		fwChain := svcInfo.firewallChainName
+		if usesFWChain {
+			ensureChain(fwChain, tx, activeChains)
+			var sources []string
+			allowFromNode := false
+			for _, src := range svcInfo.LoadBalancerSourceRanges() {
+				_, cidr, _ := netutils.ParseCIDRSloppy(src)
+				if cidr == nil {
+					continue
+				}
+				if len(sources) > 0 {
+					sources = append(sources, ",")
+				}
+				sources = append(sources, src)
+				if cidr.Contains(proxier.nodeIP) {
+					allowFromNode = true
+				}
+			}
+			// For VIP-like LBs, the VIP is often added as a local
+			// address (via an IP route rule). In that case, a request
+			// from a node to the VIP will not hit the loadbalancer but
+			// will loop back with the source IP set to the VIP. We
+			// need the following rules to allow requests from this node.
+			if allowFromNode {
+				for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
+					sources = append(sources, ",", lbip)
+				}
+			}
+			tx.Add(&knftables.Rule{
+				Chain: fwChain,
+				Rule: knftables.Concat(
+					ipX, "saddr", "!=", "{", sources, "}",
+					"drop",
+				),
+			})
+		}
+
 		// Capture load-balancer ingress.
 		for _, lbip := range svcInfo.LoadBalancerVIPStrings() {
 			if hasEndpoints {
@@ -1221,53 +1239,19 @@ func (proxier *Proxier) syncProxyRules() {
 				})
 			}

-			if len(svcInfo.LoadBalancerSourceRanges()) > 0 {
+			if usesFWChain {
 				tx.Add(&knftables.Element{
-					Set: kubeFirewallSet,
+					Map: kubeFirewallIPsMap,
 					Key: []string{
 						lbip,
 						protocol,
 						strconv.Itoa(svcInfo.Port()),
 					},
+					Value: []string{
+						fmt.Sprintf("goto %s", fwChain),
+					},
 					Comment: &svcPortNameString,
 				})
-
-				allowFromNode := false
-				for _, src := range svcInfo.LoadBalancerSourceRanges() {
-					_, cidr, _ := netutils.ParseCIDRSloppy(src)
-					if cidr == nil {
-						continue
-					}
-					tx.Add(&knftables.Element{
-						Set: kubeFirewallAllowSet,
-						Key: []string{
-							lbip,
-							protocol,
-							strconv.Itoa(svcInfo.Port()),
-							src,
-						},
-						Comment: &svcPortNameString,
-					})
-					if cidr.Contains(proxier.nodeIP) {
-						allowFromNode = true
-					}
-				}
-				// For VIP-like LBs, the VIP is often added as a local
-				// address (via an IP route rule). In that case, a request
-				// from a node to the VIP will not hit the loadbalancer but
-				// will loop back with the source IP set to the VIP. We
-				// need the following rules to allow requests from this node.
-				if allowFromNode {
-					tx.Add(&knftables.Element{
-						Set: kubeFirewallAllowSet,
-						Key: []string{
-							lbip,
-							protocol,
-							strconv.Itoa(svcInfo.Port()),
-							lbip,
-						},
-					})
-				}
 			}
 		}

 		if !hasExternalEndpoints {
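
As an illustration of the allowFromNode handling above: when one of the
source ranges contains the node IP, the LB VIP itself is appended to the
allowed sources, so the rendered per-service chain would look roughly
like this (chain name and addresses illustrative):

	chain firewall-ULMVA6XW-ns1/svc1/tcp/p80 {
		# 10.0.0.0/8 covers the node IP, so the VIP 192.168.99.22 is allowed too
		ip saddr != { 10.0.0.0/8, 192.168.99.22 } drop
	}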