test: Stop kubelet systemd service after node e2e
Currently, when running node e2e it's not possible to use the ginkgo `--repeat` flag to run the test suite multiple times. This is useful when debugging tests and ensuring they are not flaky by re-running them several times. Currently, if the `--repeat` ginkgo flag is used, the 2nd run of the test will fail because kubelet does not start, with a message like: ``` Failed to start transient service unit: Unit kubelet-20221020T040841.service already exists. ``` This is because during the test startup, kubelet is started as a transient systemd unit via `systemd-run`. The unit is started with the `--remain-after-exit` flag to ensure that the unit will remain even if the kubelet is restarted. The test suite currently uses the `systemctl kill` command to stop kubelet. This works fine for stopping the kubelet, but on the second run, when `systemd-run` is used to start the systemd unit again, it will fail because the unit already exists. This is because `systemctl kill` will not delete the systemd unit; it only sends the SIGTERM signal to it. To fix this, add `systemdUnitName` as a field to the `server` struct. When the kubelet server is constructed, set the unit name. As part of e2e test termination, in `E2EServices.Stop()`, stop the kubelet systemd unit. By stopping the kubelet systemd unit, systemd will delete the systemd transient unit, allowing it to be created and started again in a subsequent e2e run. Signed-off-by: David Porter <david@porter.me>
This commit is contained in:
@@ -64,11 +64,13 @@ type server struct {
|
||||
stopRestartingCh chan<- bool
|
||||
// Read from this to confirm that the restart loop has stopped.
|
||||
ackStopRestartingCh <-chan bool
|
||||
// The systemd unit name for the service, if one exists. If the server is not managed by systemd, this field is empty.
|
||||
systemdUnitName string
|
||||
}
|
||||
|
||||
// newServer returns a new server with the given name, commands, health check
|
||||
// URLs, etc.
|
||||
func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, monitorParent, restartOnExit bool) *server {
|
||||
func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, monitorParent, restartOnExit bool, systemdUnitName string) *server {
|
||||
return &server{
|
||||
name: name,
|
||||
startCommand: start,
|
||||
@@ -78,6 +80,7 @@ func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outpu
|
||||
outFilename: outputFileName,
|
||||
monitorParent: monitorParent,
|
||||
restartOnExit: restartOnExit,
|
||||
systemdUnitName: systemdUnitName,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -313,3 +316,14 @@ func (s *server) kill() error {
|
||||
|
||||
return fmt.Errorf("unable to stop %q", name)
|
||||
}
|
||||
|
||||
func (s *server) stopUnit() error {
|
||||
klog.Infof("Stopping systemd unit for server %q with unit name: %q", s.name, s.systemdUnitName)
|
||||
if s.systemdUnitName != "" {
|
||||
err := exec.Command("sudo", "systemctl", "stop", s.systemdUnitName).Run()
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to stop systemd unit name: %q: %v", s.systemdUnitName, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user