Merge pull request #122692 from aroradaman/reject-packets-to-invalid-port

proxy/nftables: reject packets destined for invalid ports of service ips
Kubernetes Prow Robot authored on 2024-01-24 23:17:34 +01:00, committed by GitHub
13 changed files with 457 additions and 150 deletions


@@ -55,6 +55,7 @@ import (
proxyutiliptables "k8s.io/kubernetes/pkg/proxy/util/iptables"
"k8s.io/kubernetes/pkg/util/async"
utilexec "k8s.io/utils/exec"
netutils "k8s.io/utils/net"
"k8s.io/utils/ptr"
)
@@ -63,27 +64,44 @@ const (
// so they don't need any "kube-" or "kube-proxy-" prefix of their own.
kubeProxyTable = "kube-proxy"
// base chains
filterPreroutingChain = "filter-prerouting"
filterInputChain = "filter-input"
filterForwardChain = "filter-forward"
filterOutputChain = "filter-output"
filterOutputPostDNATChain = "filter-output-post-dnat"
natPreroutingChain = "nat-prerouting"
natOutputChain = "nat-output"
natPostroutingChain = "nat-postrouting"
// service dispatch
kubeServicesChain = "services"
kubeServiceIPsMap = "service-ips"
kubeServiceNodePortsMap = "service-nodeports"
servicesChain = "services"
serviceIPsMap = "service-ips"
serviceNodePortsMap = "service-nodeports"
// set of IPs that accept NodePort traffic
kubeNodePortIPsSet = "nodeport-ips"
nodePortIPsSet = "nodeport-ips"
// set of active ClusterIPs.
clusterIPsSet = "cluster-ips"
// handling for services with no endpoints
kubeEndpointsCheckChain = "endpoints-check"
kubeNoEndpointServicesMap = "no-endpoint-services"
kubeNoEndpointNodePortsMap = "no-endpoint-nodeports"
kubeRejectChain = "reject-chain"
serviceEndpointsCheckChain = "service-endpoints-check"
nodePortEndpointsCheckChain = "nodeport-endpoints-check"
noEndpointServicesMap = "no-endpoint-services"
noEndpointNodePortsMap = "no-endpoint-nodeports"
rejectChain = "reject-chain"
// handling traffic to unallocated ClusterIPs and undefined ports of ClusterIPs
clusterIPsCheckChain = "cluster-ips-check"
// LoadBalancerSourceRanges handling
kubeFirewallIPsMap = "firewall-ips"
kubeFirewallCheckChain = "firewall-check"
firewallIPsMap = "firewall-ips"
firewallCheckChain = "firewall-check"
// masquerading
kubeMarkMasqChain = "mark-for-masquerade"
kubeMasqueradingChain = "masquerading"
markMasqChain = "mark-for-masquerade"
masqueradingChain = "masquerading"
)
// internal struct for string service information
@@ -179,6 +197,10 @@ type Proxier struct {
// staleChains contains information about chains to be deleted later
staleChains map[string]time.Time
// serviceCIDRs is a comma-separated list of ServiceCIDRs belonging to the IPFamily
// which this proxier is operating on, in a form that can be directly consumed by knftables.
serviceCIDRs string
}
// Proxier implements proxy.Provider
@@ -309,13 +331,14 @@ type nftablesBaseChain struct {
var nftablesBaseChains = []nftablesBaseChain{
// We want our filtering rules to operate on pre-DNAT dest IPs, so our filter
// chains have to run before DNAT.
{"filter-prerouting", knftables.FilterType, knftables.PreroutingHook, knftables.DNATPriority + "-10"},
{"filter-input", knftables.FilterType, knftables.InputHook, knftables.DNATPriority + "-10"},
{"filter-forward", knftables.FilterType, knftables.ForwardHook, knftables.DNATPriority + "-10"},
{"filter-output", knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "-10"},
{"nat-prerouting", knftables.NATType, knftables.PreroutingHook, knftables.DNATPriority},
{"nat-output", knftables.NATType, knftables.OutputHook, knftables.DNATPriority},
{"nat-postrouting", knftables.NATType, knftables.PostroutingHook, knftables.SNATPriority},
{filterPreroutingChain, knftables.FilterType, knftables.PreroutingHook, knftables.DNATPriority + "-10"},
{filterInputChain, knftables.FilterType, knftables.InputHook, knftables.DNATPriority + "-10"},
{filterForwardChain, knftables.FilterType, knftables.ForwardHook, knftables.DNATPriority + "-10"},
{filterOutputChain, knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "-10"},
{filterOutputPostDNATChain, knftables.FilterType, knftables.OutputHook, knftables.DNATPriority + "+10"},
{natPreroutingChain, knftables.NATType, knftables.PreroutingHook, knftables.DNATPriority},
{natOutputChain, knftables.NATType, knftables.OutputHook, knftables.DNATPriority},
{natPostroutingChain, knftables.NATType, knftables.PostroutingHook, knftables.SNATPriority},
}
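The numeric suffixes are what make this work: knftables priorities are strings, so knftables.DNATPriority + "-10" renders as nft priority "dstnat - 10", ordering those filter chains before DNAT, while the new filter-output-post-dnat chain at "dstnat + 10" runs after DNAT and therefore sees rewritten destination IPs. A minimal sketch of the two output-hook chains (assuming the sigs.k8s.io/knftables import path, which the hunk above doesn't show):

tx.Add(&knftables.Chain{
	Name:     filterOutputChain, // "filter-output": still sees pre-DNAT dest IPs
	Type:     ptr.To(knftables.FilterType),
	Hook:     ptr.To(knftables.OutputHook),
	Priority: ptr.To(knftables.DNATPriority + "-10"),
})
tx.Add(&knftables.Chain{
	Name:     filterOutputPostDNATChain, // "filter-output-post-dnat": sees post-DNAT dest IPs
	Type:     ptr.To(knftables.FilterType),
	Hook:     ptr.To(knftables.OutputHook),
	Priority: ptr.To(knftables.DNATPriority + "+10"),
})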
// nftablesJumpChains lists our top-level "regular chains" that are jumped to directly
@@ -328,19 +351,23 @@ type nftablesJumpChain struct {
}
var nftablesJumpChains = []nftablesJumpChain{
// We can't jump to kubeEndpointsCheckChain from filter-prerouting like
// kubeFirewallCheckChain because reject action is only valid in chains using the
// We can't jump to endpointsCheckChain from filter-prerouting like
// firewallCheckChain because reject action is only valid in chains using the
// input, forward or output hooks.
{kubeEndpointsCheckChain, "filter-input", "ct state new"},
{kubeEndpointsCheckChain, "filter-forward", "ct state new"},
{kubeEndpointsCheckChain, "filter-output", "ct state new"},
{nodePortEndpointsCheckChain, filterInputChain, "ct state new"},
{serviceEndpointsCheckChain, filterInputChain, "ct state new"},
{serviceEndpointsCheckChain, filterForwardChain, "ct state new"},
{serviceEndpointsCheckChain, filterOutputChain, "ct state new"},
{kubeFirewallCheckChain, "filter-prerouting", "ct state new"},
{kubeFirewallCheckChain, "filter-output", "ct state new"},
{firewallCheckChain, filterPreroutingChain, "ct state new"},
{firewallCheckChain, filterOutputChain, "ct state new"},
{kubeServicesChain, "nat-output", ""},
{kubeServicesChain, "nat-prerouting", ""},
{kubeMasqueradingChain, "nat-postrouting", ""},
{servicesChain, natOutputChain, ""},
{servicesChain, natPreroutingChain, ""},
{masqueradingChain, natPostroutingChain, ""},
{clusterIPsCheckChain, filterForwardChain, "ct state new"},
{clusterIPsCheckChain, filterOutputPostDNATChain, "ct state new"},
}
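Each entry in this table becomes a single rule in the named base chain. A sketch of the expansion, assuming nftablesJumpChain's fields are dstChain, srcChain, and extraArgs as the columns above suggest:

for _, c := range nftablesJumpChains {
	ensureChain(c.dstChain, tx, createdChains)
	tx.Add(&knftables.Rule{
		Chain: c.srcChain,
		// e.g. "ct state new jump cluster-ips-check" in filter-forward:
		// only the first packet of each connection takes the detour.
		Rule: knftables.Concat(c.extraArgs, "jump", c.dstChain),
	})
}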
// ensureChain adds commands to tx to ensure that chain exists and doesn't contain
@@ -399,51 +426,79 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
}
// Ensure all of our other "top-level" chains exist
for _, chain := range []string{kubeServicesChain, kubeMasqueradingChain, kubeMarkMasqChain} {
for _, chain := range []string{servicesChain, clusterIPsCheckChain, masqueradingChain, markMasqChain} {
ensureChain(chain, tx, createdChains)
}
// Add the rules in the mark-for-masquerade and masquerading chains
tx.Add(&knftables.Rule{
Chain: kubeMarkMasqChain,
Chain: markMasqChain,
Rule: knftables.Concat(
"mark", "set", "mark", "or", proxier.masqueradeMark,
),
})
tx.Add(&knftables.Rule{
Chain: kubeMasqueradingChain,
Chain: masqueradingChain,
Rule: knftables.Concat(
"mark", "and", proxier.masqueradeMark, "==", "0",
"return",
),
})
tx.Add(&knftables.Rule{
Chain: kubeMasqueradingChain,
Chain: masqueradingChain,
Rule: knftables.Concat(
"mark", "set", "mark", "xor", proxier.masqueradeMark,
),
})
tx.Add(&knftables.Rule{
Chain: kubeMasqueradingChain,
Chain: masqueradingChain,
Rule: "masquerade fully-random",
})
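Taken together, the two chains implement deferred masquerading: mark-for-masquerade only sets a bit in the packet mark, and masquerading, running on the nat-postrouting hook, returns early unless that bit is set, clears it, then SNATs. A hedged rendering, assuming an illustrative masqueradeMark of "0x4000":

// What nft would roughly list for the masquerading chain:
//
//	mark and 0x4000 == 0 return   (not marked by us: leave the packet alone)
//	mark set mark xor 0x4000      (clear our bit so it doesn't leak onward)
//	masquerade fully-random       (SNAT to the outgoing interface's address)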
// add cluster-ips set.
tx.Add(&knftables.Set{
Name: clusterIPsSet,
Type: ipvX_addr,
Comment: ptr.To("Active ClusterIPs"),
})
// reject traffic to invalid ports of ClusterIPs.
tx.Add(&knftables.Rule{
Chain: clusterIPsCheckChain,
Rule: knftables.Concat(
ipX, "daddr", "@", clusterIPsSet, "reject",
),
Comment: ptr.To("Reject traffic to invalid ports of ClusterIPs"),
})
// drop traffic to unallocated ClusterIPs.
if len(proxier.serviceCIDRs) > 0 {
tx.Add(&knftables.Rule{
Chain: clusterIPsCheckChain,
Rule: knftables.Concat(
ipX, "daddr", "{", proxier.serviceCIDRs, "}",
"drop",
),
Comment: ptr.To("Drop traffic to unallocated ClusterIPs"),
})
}
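Ordering is what makes the reject safe here: cluster-ips-check is only reached from filter-forward and filter-output-post-dnat, both of which see post-DNAT addresses. A packet still addressed to an IP in @cluster-ips at that point matched no DNAT rule, so its port must be invalid and it is rejected (ICMP port-unreachable by default); any other address inside the configured ServiceCIDRs was never allocated and is silently dropped. A standalone sketch of the chain, assuming the sigs.k8s.io/knftables import path and an illustrative 10.96.0.0/16 ServiceCIDR:

nft, err := knftables.New(knftables.IPv4Family, "demo") // illustrative table name
if err != nil {
	panic(err)
}
tx := nft.NewTransaction()
tx.Add(&knftables.Table{})
tx.Add(&knftables.Set{Name: "cluster-ips", Type: "ipv4_addr"})
tx.Add(&knftables.Chain{Name: "cluster-ips-check"})
// Allocated ClusterIP that nothing DNATed: invalid port, reject.
tx.Add(&knftables.Rule{Chain: "cluster-ips-check", Rule: "ip daddr @cluster-ips reject"})
// Inside the ServiceCIDR but never allocated: drop with no response.
tx.Add(&knftables.Rule{Chain: "cluster-ips-check", Rule: "ip daddr { 10.96.0.0/16 } drop"})
if err := nft.Run(context.Background(), tx); err != nil {
	panic(err)
}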
// Fill in nodeport-ips set if needed (or delete it if not). (We do "add+delete"
// rather than just "delete" when we want to ensure the set doesn't exist, because
// doing just "delete" would return an error if the set didn't exist.)
tx.Add(&knftables.Set{
Name: kubeNodePortIPsSet,
Name: nodePortIPsSet,
Type: ipvX_addr,
Comment: ptr.To("IPs that accept NodePort traffic"),
})
if proxier.nodePortAddresses.MatchAll() {
tx.Delete(&knftables.Set{
Name: kubeNodePortIPsSet,
Name: nodePortIPsSet,
})
} else {
tx.Flush(&knftables.Set{
Name: kubeNodePortIPsSet,
Name: nodePortIPsSet,
})
nodeIPs, err := proxier.nodePortAddresses.GetNodeIPs(proxier.networkInterfacer)
if err != nil {
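The "add+delete" idiom described above generalizes to any nftables object: an add is a no-op if the object already exists, so adding immediately before deleting guarantees the delete can never fail with "set does not exist". A minimal sketch:

// Idempotently ensure the set is absent, even on a fresh table.
tx.Add(&knftables.Set{Name: nodePortIPsSet, Type: ipvX_addr})
tx.Delete(&knftables.Set{Name: nodePortIPsSet})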
@@ -455,7 +510,7 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
continue
}
tx.Add(&knftables.Element{
Set: kubeNodePortIPsSet,
Set: nodePortIPsSet,
Key: []string{
ip.String(),
},
@@ -465,108 +520,107 @@ func (proxier *Proxier) setupNFTables(tx *knftables.Transaction) {
// Set up "no endpoints" drop/reject handling
tx.Add(&knftables.Map{
Name: kubeNoEndpointServicesMap,
Name: noEndpointServicesMap,
Type: ipvX_addr + " . inet_proto . inet_service : verdict",
Comment: ptr.To("vmap to drop or reject packets to services with no endpoints"),
})
tx.Add(&knftables.Map{
Name: kubeNoEndpointNodePortsMap,
Name: noEndpointNodePortsMap,
Type: "inet_proto . inet_service : verdict",
Comment: ptr.To("vmap to drop or reject packets to service nodeports with no endpoints"),
})
tx.Add(&knftables.Chain{
Name: kubeRejectChain,
Name: rejectChain,
Comment: ptr.To("helper for @no-endpoint-services / @no-endpoint-nodeports"),
})
tx.Flush(&knftables.Chain{
Name: kubeRejectChain,
Name: rejectChain,
})
tx.Add(&knftables.Rule{
Chain: kubeRejectChain,
Chain: rejectChain,
Rule: "reject",
})
tx.Add(&knftables.Rule{
Chain: kubeEndpointsCheckChain,
Chain: serviceEndpointsCheckChain,
Rule: knftables.Concat(
ipX, "daddr", ".", "meta l4proto", ".", "th dport",
"vmap", "@", kubeNoEndpointServicesMap,
"vmap", "@", noEndpointServicesMap,
),
})
if proxier.nodePortAddresses.MatchAll() {
tx.Add(&knftables.Rule{
Chain: kubeEndpointsCheckChain,
Chain: nodePortEndpointsCheckChain,
Rule: knftables.Concat(
"fib daddr type local",
noLocalhost,
"meta l4proto . th dport",
"vmap", "@", kubeNoEndpointNodePortsMap,
"vmap", "@", noEndpointNodePortsMap,
),
})
} else {
tx.Add(&knftables.Rule{
Chain: kubeEndpointsCheckChain,
Chain: nodePortEndpointsCheckChain,
Rule: knftables.Concat(
ipX, "daddr", "@", kubeNodePortIPsSet,
ipX, "daddr", "@", nodePortIPsSet,
"meta l4proto . th dport",
"vmap", "@", kubeNoEndpointNodePortsMap,
"vmap", "@", noEndpointNodePortsMap,
),
})
}
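Both vmaps are filled in later by syncProxyRules: every service or nodeport without usable endpoints contributes one element whose verdict is either "goto reject-chain" (so clients get a fast failure) or "drop". A hedged example element, with invented values:

// Invented example: a ClusterIP service with no ready endpoints
// whose traffic policy calls for reject rather than drop.
tx.Add(&knftables.Element{
	Map:   noEndpointServicesMap, // key: ipvX_addr . inet_proto . inet_service
	Key:   []string{"10.96.0.10", "tcp", "53"},
	Value: []string{"goto reject-chain"},
})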
// Set up LoadBalancerSourceRanges firewalling
tx.Add(&knftables.Map{
Name: kubeFirewallIPsMap,
Name: firewallIPsMap,
Type: ipvX_addr + " . inet_proto . inet_service : verdict",
Comment: ptr.To("destinations that are subject to LoadBalancerSourceRanges"),
})
ensureChain(kubeFirewallCheckChain, tx, createdChains)
ensureChain(firewallCheckChain, tx, createdChains)
tx.Add(&knftables.Rule{
Chain: kubeFirewallCheckChain,
Chain: firewallCheckChain,
Rule: knftables.Concat(
ipX, "daddr", ".", "meta l4proto", ".", "th dport",
"vmap", "@", kubeFirewallIPsMap,
"vmap", "@", firewallIPsMap,
),
})
// Set up service dispatch
tx.Add(&knftables.Map{
Name: kubeServiceIPsMap,
Name: serviceIPsMap,
Type: ipvX_addr + " . inet_proto . inet_service : verdict",
Comment: ptr.To("ClusterIP, ExternalIP and LoadBalancer IP traffic"),
})
tx.Add(&knftables.Map{
Name: kubeServiceNodePortsMap,
Name: serviceNodePortsMap,
Type: "inet_proto . inet_service : verdict",
Comment: ptr.To("NodePort traffic"),
})
tx.Add(&knftables.Rule{
Chain: kubeServicesChain,
Chain: servicesChain,
Rule: knftables.Concat(
ipX, "daddr", ".", "meta l4proto", ".", "th dport",
"vmap", "@", kubeServiceIPsMap,
"vmap", "@", serviceIPsMap,
),
})
if proxier.nodePortAddresses.MatchAll() {
tx.Add(&knftables.Rule{
Chain: kubeServicesChain,
Chain: servicesChain,
Rule: knftables.Concat(
"fib daddr type local",
noLocalhost,
"meta l4proto . th dport",
"vmap", "@", kubeServiceNodePortsMap,
"vmap", "@", serviceNodePortsMap,
),
})
} else {
tx.Add(&knftables.Rule{
Chain: kubeServicesChain,
Chain: servicesChain,
Rule: knftables.Concat(
ipX, "daddr @nodeport-ips",
"meta l4proto . th dport",
"vmap", "@", kubeServiceNodePortsMap,
"vmap", "@", serviceNodePortsMap,
),
})
}
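Service dispatch uses the same vmap pattern: syncProxyRules later adds one element per service IP and per nodeport, each jumping to that service's own chain. An invented example (the per-service chain name format shown is illustrative):

tx.Add(&knftables.Element{
	Map:   serviceIPsMap,
	Key:   []string{"10.96.0.1", "tcp", "443"},
	Value: []string{"goto service-ULMVA6XW-default/kubernetes/tcp/https"}, // invented name
})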
@@ -764,6 +818,26 @@ func (proxier *Proxier) OnNodeDelete(node *v1.Node) {
func (proxier *Proxier) OnNodeSynced() {
}
// OnServiceCIDRsChanged is called whenever a change is observed
// in any of the ServiceCIDRs, and provides the complete list of service CIDRs.
func (proxier *Proxier) OnServiceCIDRsChanged(cidrs []string) {
proxier.mu.Lock()
defer proxier.mu.Unlock()
cidrsForProxier := make([]string, 0)
for _, cidr := range cidrs {
isIPv4CIDR := netutils.IsIPv4CIDRString(cidr)
if proxier.ipFamily == v1.IPv4Protocol && isIPv4CIDR {
cidrsForProxier = append(cidrsForProxier, cidr)
}
if proxier.ipFamily == v1.IPv6Protocol && !isIPv4CIDR {
cidrsForProxier = append(cidrsForProxier, cidr)
}
}
proxier.serviceCIDRs = strings.Join(cidrsForProxier, ",")
}
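For example, with hypothetical dual-stack input, an IPv4 proxier keeps only the IPv4 entry, and the joined result can be dropped directly into an nft set expression:

// Hypothetical usage; only the IPv4 CIDR survives the family filter.
proxier.OnServiceCIDRsChanged([]string{"10.96.0.0/16", "fd00:10:96::/112"})
// proxier.serviceCIDRs == "10.96.0.0/16", consumed above as
// "ip daddr { 10.96.0.0/16 } drop" in cluster-ips-check.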
const (
// Maximum length for one of our chain name prefixes, including the trailing
// hyphen.
@@ -955,20 +1029,23 @@ func (proxier *Proxier) syncProxyRules() {
}
// We currently fully-rebuild our sets and maps on each resync
tx.Flush(&knftables.Map{
Name: kubeFirewallIPsMap,
tx.Flush(&knftables.Set{
Name: clusterIPsSet,
})
tx.Flush(&knftables.Map{
Name: kubeNoEndpointServicesMap,
Name: firewallIPsMap,
})
tx.Flush(&knftables.Map{
Name: kubeNoEndpointNodePortsMap,
Name: noEndpointServicesMap,
})
tx.Flush(&knftables.Map{
Name: kubeServiceIPsMap,
Name: noEndpointNodePortsMap,
})
tx.Flush(&knftables.Map{
Name: kubeServiceNodePortsMap,
Name: serviceIPsMap,
})
tx.Flush(&knftables.Map{
Name: serviceNodePortsMap,
})
// Accumulate service/endpoint chains and affinity sets to keep.
@@ -1074,8 +1151,8 @@ func (proxier *Proxier) syncProxyRules() {
// generate any chains in the "nat" table for the service; only
// rules in the "filter" table rejecting incoming packets for
// the service's IPs.
internalTrafficFilterVerdict = fmt.Sprintf("goto %s", kubeRejectChain)
externalTrafficFilterVerdict = fmt.Sprintf("goto %s", kubeRejectChain)
internalTrafficFilterVerdict = fmt.Sprintf("goto %s", rejectChain)
externalTrafficFilterVerdict = fmt.Sprintf("goto %s", rejectChain)
} else {
if !hasInternalEndpoints {
// The internalTrafficPolicy is "Local" but there are no local
@@ -1095,9 +1172,13 @@ func (proxier *Proxier) syncProxyRules() {
}
// Capture the clusterIP.
tx.Add(&knftables.Element{
Set: clusterIPsSet,
Key: []string{svcInfo.ClusterIP().String()},
})
if hasInternalEndpoints {
tx.Add(&knftables.Element{
Map: kubeServiceIPsMap,
Map: serviceIPsMap,
Key: []string{
svcInfo.ClusterIP().String(),
protocol,
@@ -1110,7 +1191,7 @@ func (proxier *Proxier) syncProxyRules() {
} else {
// No endpoints.
tx.Add(&knftables.Element{
Map: kubeNoEndpointServicesMap,
Map: noEndpointServicesMap,
Key: []string{
svcInfo.ClusterIP().String(),
protocol,
@@ -1129,7 +1210,7 @@ func (proxier *Proxier) syncProxyRules() {
// Send traffic bound for external IPs to the "external
// destinations" chain.
tx.Add(&knftables.Element{
Map: kubeServiceIPsMap,
Map: serviceIPsMap,
Key: []string{
externalIP.String(),
protocol,
@@ -1145,7 +1226,7 @@ func (proxier *Proxier) syncProxyRules() {
// external traffic (DROP anything that didn't get
// short-circuited by the EXT chain.)
tx.Add(&knftables.Element{
Map: kubeNoEndpointServicesMap,
Map: noEndpointServicesMap,
Key: []string{
externalIP.String(),
protocol,
@@ -1197,7 +1278,7 @@ func (proxier *Proxier) syncProxyRules() {
for _, lbip := range svcInfo.LoadBalancerVIPs() {
if hasEndpoints {
tx.Add(&knftables.Element{
Map: kubeServiceIPsMap,
Map: serviceIPsMap,
Key: []string{
lbip.String(),
protocol,
@@ -1211,7 +1292,7 @@ func (proxier *Proxier) syncProxyRules() {
if usesFWChain {
tx.Add(&knftables.Element{
Map: kubeFirewallIPsMap,
Map: firewallIPsMap,
Key: []string{
lbip.String(),
protocol,
@@ -1230,7 +1311,7 @@ func (proxier *Proxier) syncProxyRules() {
// by the EXT chain.)
for _, lbip := range svcInfo.LoadBalancerVIPs() {
tx.Add(&knftables.Element{
Map: kubeNoEndpointServicesMap,
Map: noEndpointServicesMap,
Key: []string{
lbip.String(),
protocol,
@@ -1251,7 +1332,7 @@ func (proxier *Proxier) syncProxyRules() {
// worse, nodeports are not subject to loadBalancerSourceRanges,
// and we can't change that.
tx.Add(&knftables.Element{
Map: kubeServiceNodePortsMap,
Map: serviceNodePortsMap,
Key: []string{
protocol,
strconv.Itoa(svcInfo.NodePort()),
@@ -1266,7 +1347,7 @@ func (proxier *Proxier) syncProxyRules() {
// external traffic (DROP anything that didn't get
// short-circuited by the EXT chain.)
tx.Add(&knftables.Element{
Map: kubeNoEndpointNodePortsMap,
Map: noEndpointNodePortsMap,
Key: []string{
protocol,
strconv.Itoa(svcInfo.NodePort()),
@@ -1287,7 +1368,7 @@ func (proxier *Proxier) syncProxyRules() {
Rule: knftables.Concat(
ipX, "daddr", svcInfo.ClusterIP(),
protocol, "dport", svcInfo.Port(),
"jump", kubeMarkMasqChain,
"jump", markMasqChain,
),
})
} else if proxier.localDetector.IsImplemented() {
@@ -1302,7 +1383,7 @@ func (proxier *Proxier) syncProxyRules() {
ipX, "daddr", svcInfo.ClusterIP(),
protocol, "dport", svcInfo.Port(),
proxier.localDetector.IfNotLocalNFT(),
"jump", kubeMarkMasqChain,
"jump", markMasqChain,
),
})
}
@@ -1319,7 +1400,7 @@ func (proxier *Proxier) syncProxyRules() {
tx.Add(&knftables.Rule{
Chain: externalTrafficChain,
Rule: knftables.Concat(
"jump", kubeMarkMasqChain,
"jump", markMasqChain,
),
})
} else {
@@ -1348,7 +1429,7 @@ func (proxier *Proxier) syncProxyRules() {
Chain: externalTrafficChain,
Rule: knftables.Concat(
"fib", "saddr", "type", "local",
"jump", kubeMarkMasqChain,
"jump", markMasqChain,
),
Comment: ptr.To("masquerade local traffic"),
})
@@ -1441,7 +1522,7 @@ func (proxier *Proxier) syncProxyRules() {
Chain: endpointChain,
Rule: knftables.Concat(
ipX, "saddr", epInfo.IP(),
"jump", kubeMarkMasqChain,
"jump", markMasqChain,
),
})