Added rotation metric to certificate manager
This commit is contained in:
		| @@ -113,8 +113,17 @@ type Config struct { | ||||
| 	// quickly replaced with a unique cert/key pair. | ||||
| 	BootstrapKeyPEM []byte | ||||
| 	// CertificateExpiration will record a metric that shows the remaining | ||||
| 	// lifetime of the certificate. | ||||
| 	// lifetime of the certificate. This metric is a gauge because only the | ||||
| 	// current cert expiry time is really useful. Reading this metric at any | ||||
| 	// time simply gives the next expiration date, no need to keep some | ||||
| 	// history (histogram) of all previous expiry dates. | ||||
| 	CertificateExpiration Gauge | ||||
| 	// CertificateRotation will record a metric showing the time in seconds | ||||
| 	// that certificates lived before being rotated. This metric is a histogram | ||||
| 	// because there is value in keeping a history of rotation cadences. It | ||||
| 	// allows one to setup monitoring and alerting of unexpected rotation | ||||
| 	// behavior and track trends in rotation frequency. | ||||
| 	CertificateRotation Histogram | ||||
| } | ||||
|  | ||||
| // Store is responsible for getting and updating the current certificate. | ||||
| @@ -139,6 +148,12 @@ type Gauge interface { | ||||
| 	Set(float64) | ||||
| } | ||||
|  | ||||
| // Histogram will record the time a rotated certificate was used before being | ||||
| // rotated. | ||||
| type Histogram interface { | ||||
| 	Observe(float64) | ||||
| } | ||||
|  | ||||
| // NoCertKeyError indicates there is no cert/key currently available. | ||||
| type NoCertKeyError string | ||||
|  | ||||
| @@ -163,6 +178,7 @@ type manager struct { | ||||
| 	certStore Store | ||||
|  | ||||
| 	certificateExpiration Gauge | ||||
| 	certificateRotation   Histogram | ||||
|  | ||||
| 	// the following variables must only be accessed under certAccessLock | ||||
| 	certAccessLock sync.RWMutex | ||||
| @@ -174,6 +190,9 @@ type manager struct { | ||||
| 	clientFn         CSRClientFunc | ||||
| 	stopCh           chan struct{} | ||||
| 	stopped          bool | ||||
|  | ||||
| 	// Set to time.Now but can be stubbed out for testing | ||||
| 	now func() time.Time | ||||
| } | ||||
|  | ||||
| // NewManager returns a new certificate manager. A certificate manager is | ||||
| @@ -203,6 +222,8 @@ func NewManager(config *Config) (Manager, error) { | ||||
| 		cert:                  cert, | ||||
| 		forceRotation:         forceRotation, | ||||
| 		certificateExpiration: config.CertificateExpiration, | ||||
| 		certificateRotation:   config.CertificateRotation, | ||||
| 		now:                   time.Now, | ||||
| 	} | ||||
|  | ||||
| 	return &m, nil | ||||
| @@ -215,7 +236,7 @@ func NewManager(config *Config) (Manager, error) { | ||||
| func (m *manager) Current() *tls.Certificate { | ||||
| 	m.certAccessLock.RLock() | ||||
| 	defer m.certAccessLock.RUnlock() | ||||
| 	if m.cert != nil && m.cert.Leaf != nil && time.Now().After(m.cert.Leaf.NotAfter) { | ||||
| 	if m.cert != nil && m.cert.Leaf != nil && m.now().After(m.cert.Leaf.NotAfter) { | ||||
| 		klog.V(2).Infof("Current certificate is expired.") | ||||
| 		return nil | ||||
| 	} | ||||
| @@ -256,7 +277,7 @@ func (m *manager) Start() { | ||||
| 	templateChanged := make(chan struct{}) | ||||
| 	go wait.Until(func() { | ||||
| 		deadline := m.nextRotationDeadline() | ||||
| 		if sleepInterval := deadline.Sub(time.Now()); sleepInterval > 0 { | ||||
| 		if sleepInterval := deadline.Sub(m.now()); sleepInterval > 0 { | ||||
| 			klog.V(2).Infof("Waiting %v for next certificate rotation", sleepInterval) | ||||
|  | ||||
| 			timer := time.NewTimer(sleepInterval) | ||||
| @@ -421,7 +442,10 @@ func (m *manager) rotateCerts() (bool, error) { | ||||
| 		return false, nil | ||||
| 	} | ||||
|  | ||||
| 	m.updateCached(cert) | ||||
| 	if old := m.updateCached(cert); old != nil && m.certificateRotation != nil { | ||||
| 		m.certificateRotation.Observe(m.now().Sub(old.Leaf.NotBefore).Seconds()) | ||||
| 	} | ||||
|  | ||||
| 	return true, nil | ||||
| } | ||||
|  | ||||
| @@ -490,14 +514,14 @@ func (m *manager) nextRotationDeadline() time.Time { | ||||
| 	// forceRotation is not protected by locks | ||||
| 	if m.forceRotation { | ||||
| 		m.forceRotation = false | ||||
| 		return time.Now() | ||||
| 		return m.now() | ||||
| 	} | ||||
|  | ||||
| 	m.certAccessLock.RLock() | ||||
| 	defer m.certAccessLock.RUnlock() | ||||
|  | ||||
| 	if !m.certSatisfiesTemplateLocked() { | ||||
| 		return time.Now() | ||||
| 		return m.now() | ||||
| 	} | ||||
|  | ||||
| 	notAfter := m.cert.Leaf.NotAfter | ||||
| @@ -523,13 +547,15 @@ var jitteryDuration = func(totalDuration float64) time.Duration { | ||||
| 	return wait.Jitter(time.Duration(totalDuration), 0.2) - time.Duration(totalDuration*0.3) | ||||
| } | ||||
|  | ||||
| // updateCached sets the most recent retrieved cert. It also sets the server | ||||
| // as assumed healthy. | ||||
| func (m *manager) updateCached(cert *tls.Certificate) { | ||||
| // updateCached sets the most recent retrieved cert and returns the old cert. | ||||
| // It also sets the server as assumed healthy. | ||||
| func (m *manager) updateCached(cert *tls.Certificate) *tls.Certificate { | ||||
| 	m.certAccessLock.Lock() | ||||
| 	defer m.certAccessLock.Unlock() | ||||
| 	m.serverHealth = true | ||||
| 	old := m.cert | ||||
| 	m.cert = cert | ||||
| 	return old | ||||
| } | ||||
|  | ||||
| // updateServerError takes an error returned by the server and infers | ||||
|   | ||||
| @@ -163,12 +163,17 @@ func TestNewManagerNoRotation(t *testing.T) { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| type gaugeMock struct { | ||||
| type metricMock struct { | ||||
| 	calls     int | ||||
| 	lastValue float64 | ||||
| } | ||||
|  | ||||
| func (g *gaugeMock) Set(v float64) { | ||||
| func (g *metricMock) Set(v float64) { | ||||
| 	g.calls++ | ||||
| 	g.lastValue = v | ||||
| } | ||||
|  | ||||
| func (g *metricMock) Observe(v float64) { | ||||
| 	g.calls++ | ||||
| 	g.lastValue = v | ||||
| } | ||||
| @@ -195,7 +200,7 @@ func TestSetRotationDeadline(t *testing.T) { | ||||
|  | ||||
| 	for _, tc := range testCases { | ||||
| 		t.Run(tc.name, func(t *testing.T) { | ||||
| 			g := gaugeMock{} | ||||
| 			g := metricMock{} | ||||
| 			m := manager{ | ||||
| 				cert: &tls.Certificate{ | ||||
| 					Leaf: &x509.Certificate{ | ||||
| @@ -206,6 +211,7 @@ func TestSetRotationDeadline(t *testing.T) { | ||||
| 				getTemplate:           func() *x509.CertificateRequest { return &x509.CertificateRequest{} }, | ||||
| 				usages:                []certificates.KeyUsage{}, | ||||
| 				certificateExpiration: &g, | ||||
| 				now:                   func() time.Time { return now }, | ||||
| 			} | ||||
| 			jitteryDuration = func(float64) time.Duration { return time.Duration(float64(tc.notAfter.Sub(tc.notBefore)) * 0.7) } | ||||
| 			lowerBound := tc.notBefore.Add(time.Duration(float64(tc.notAfter.Sub(tc.notBefore)) * 0.7)) | ||||
| @@ -383,6 +389,7 @@ func TestCertSatisfiesTemplate(t *testing.T) { | ||||
| 			m := manager{ | ||||
| 				cert:        tlsCert, | ||||
| 				getTemplate: func() *x509.CertificateRequest { return tc.template }, | ||||
| 				now:         time.Now, | ||||
| 			} | ||||
|  | ||||
| 			result := m.certSatisfiesTemplate() | ||||
| @@ -407,6 +414,7 @@ func TestRotateCertCreateCSRError(t *testing.T) { | ||||
| 		clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { | ||||
| 			return fakeClient{failureType: createError}, nil | ||||
| 		}, | ||||
| 		now: func() time.Time { return now }, | ||||
| 	} | ||||
|  | ||||
| 	if success, err := m.rotateCerts(); success { | ||||
| @@ -430,6 +438,7 @@ func TestRotateCertWaitingForResultError(t *testing.T) { | ||||
| 		clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { | ||||
| 			return fakeClient{failureType: watchError}, nil | ||||
| 		}, | ||||
| 		now: func() time.Time { return now }, | ||||
| 	} | ||||
|  | ||||
| 	defer func(t time.Duration) { certificateWaitTimeout = t }(certificateWaitTimeout) | ||||
| @@ -945,6 +954,40 @@ func TestServerHealth(t *testing.T) { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func TestRotationLogsDuration(t *testing.T) { | ||||
| 	h := metricMock{} | ||||
| 	now := time.Now() | ||||
| 	certIss := now.Add(-2 * time.Hour) | ||||
| 	m := manager{ | ||||
| 		cert: &tls.Certificate{ | ||||
| 			Leaf: &x509.Certificate{ | ||||
| 				NotBefore: certIss, | ||||
| 				NotAfter:  now.Add(-1 * time.Hour), | ||||
| 			}, | ||||
| 		}, | ||||
| 		certStore:   &fakeStore{cert: expiredStoreCertData.certificate}, | ||||
| 		getTemplate: func() *x509.CertificateRequest { return &x509.CertificateRequest{} }, | ||||
| 		clientFn: func(_ *tls.Certificate) (certificatesclient.CertificateSigningRequestInterface, error) { | ||||
| 			return &fakeClient{ | ||||
| 				certificatePEM: apiServerCertData.certificatePEM, | ||||
| 			}, nil | ||||
| 		}, | ||||
| 		certificateRotation: &h, | ||||
| 		now:                 func() time.Time { return now }, | ||||
| 	} | ||||
| 	ok, err := m.rotateCerts() | ||||
| 	if err != nil || !ok { | ||||
| 		t.Errorf("failed to rotate certs: %v", err) | ||||
| 	} | ||||
| 	if h.calls != 1 { | ||||
| 		t.Errorf("rotation metric was not called") | ||||
| 	} | ||||
| 	if h.lastValue != now.Sub(certIss).Seconds() { | ||||
| 		t.Errorf("rotation metric did not record the right value got: %f; want %f", h.lastValue, now.Sub(certIss).Seconds()) | ||||
| 	} | ||||
|  | ||||
| } | ||||
|  | ||||
| type fakeClientFailureType int | ||||
|  | ||||
| const ( | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Samuel Davidson
					Samuel Davidson