From 79fad91dbb3b1f1adc4f70da9f458c60987aa4d7 Mon Sep 17 00:00:00 2001 From: "Jason T. Greene" Date: Mon, 15 Jan 2024 17:53:30 -0600 Subject: [PATCH] Add a net health recovery service to Qemu machines There is a network stability issue in qemu + virtio, affecting some users after long periods of usage, which can lead to suspended queue delivery. Until the issue is resolved, add a temporary recovery service which restarts networking when host communication becomes inoperable. [NO NEW TESTS NEEDED] Signed-off-by: Jason T. Greene --- pkg/machine/ignition/ignition.go | 86 +++++++++++++++++++++++++---- pkg/machine/qemu/machine.go | 17 +++--- pkg/machine/qemu/options_darwin.go | 4 ++ pkg/machine/qemu/options_freebsd.go | 4 ++ pkg/machine/qemu/options_linux.go | 4 ++ pkg/machine/qemu/options_windows.go | 4 ++ 6 files changed, 100 insertions(+), 19 deletions(-) diff --git a/pkg/machine/ignition/ignition.go b/pkg/machine/ignition/ignition.go index 8e2cf1c408..19f4b22cc2 100644 --- a/pkg/machine/ignition/ignition.go +++ b/pkg/machine/ignition/ignition.go @@ -53,15 +53,16 @@ func GetNodeGrp(grpName string) NodeGroup { } type DynamicIgnition struct { - Name string - Key string - TimeZone string - UID int - VMName string - VMType define.VMType - WritePath string - Cfg Config - Rootful bool + Name string + Key string + TimeZone string + UID int + VMName string + VMType define.VMType + WritePath string + Cfg Config + Rootful bool + NetRecover bool } func (ign *DynamicIgnition) Write() error { @@ -97,7 +98,7 @@ func (ign *DynamicIgnition) GenerateIgnitionConfig() error { ignStorage := Storage{ Directories: getDirs(ign.Name), - Files: getFiles(ign.Name, ign.UID, ign.Rootful, ign.VMType), + Files: getFiles(ign.Name, ign.UID, ign.Rootful, ign.VMType, ign.NetRecover), Links: getLinks(ign.Name), } @@ -231,6 +232,21 @@ func (ign *DynamicIgnition) GenerateIgnitionConfig() error { } ignSystemd.Units = append(ignSystemd.Units, qemuUnit) } + + if ign.NetRecover { + contents, err := GetNetRecoveryUnitFile().ToString() + if err != nil { + return err + } + + recoveryUnit := Unit{ + Enabled: BoolToPtr(true), + Name: "net-health-recovery.service", + Contents: &contents, + } + ignSystemd.Units = append(ignSystemd.Units, recoveryUnit) + } + // Only after all checks are done // it's ready create the ingConfig ign.Cfg = Config{ @@ -303,7 +319,7 @@ func getDirs(usrName string) []Directory { return dirs } -func getFiles(usrName string, uid int, rootful bool, vmtype define.VMType) []File { +func getFiles(usrName string, uid int, rootful bool, vmtype define.VMType, netRecover bool) []File { files := make([]File, 0) lingerExample := parser.NewUnitFile() @@ -574,6 +590,23 @@ Delegate=memory pids cpu io }, }) + // Only necessary for qemu on mac + if netRecover { + files = append(files, File{ + Node: Node{ + User: GetNodeUsr("root"), + Group: GetNodeGrp("root"), + Path: "/usr/local/bin/net-health-recovery.sh", + }, + FileEmbedded1: FileEmbedded1{ + Mode: IntToPtr(0755), + Contents: Resource{ + Source: EncodeDataURLPtr(GetNetRecoveryFile()), + }, + }, + }) + } + return files } @@ -743,6 +776,37 @@ func (i *IgnitionBuilder) Build() error { return i.dynamicIgnition.Write() } +func GetNetRecoveryFile() string { + return `#!/bin/bash +# Verify network health, and bounce the network device if host connectivity +# is lost. This is a temporary workaround for a known rare qemu/virtio issue +# that affects some systems + +sleep 120 # allow time for network setup on initial boot +while true; do + sleep 30 + curl -s -o /dev/null --max-time 30 http://192.168.127.1/health + if [ "$?" != "0" ]; then + echo "bouncing nic due to loss of connectivity with host" + ifconfig enp0s1 down; ifconfig enp0s1 up + fi +done +` +} + +func GetNetRecoveryUnitFile() *parser.UnitFile { + recoveryUnit := parser.NewUnitFile() + recoveryUnit.Add("Unit", "Description", "Verifies health of network and recovers if necessary") + recoveryUnit.Add("Unit", "After", "sshd.socket sshd.service") + recoveryUnit.Add("Service", "ExecStart", "/usr/local/bin/net-health-recovery.sh") + recoveryUnit.Add("Service", "StandardOutput", "journal") + recoveryUnit.Add("Service", "StandardError", "journal") + recoveryUnit.Add("Service", "StandardInput", "null") + recoveryUnit.Add("Install", "WantedBy", "default.target") + + return recoveryUnit +} + func DefaultReadyUnitFile() parser.UnitFile { u := parser.NewUnitFile() u.Add("Unit", "After", "remove-moby.service sshd.socket sshd.service") diff --git a/pkg/machine/qemu/machine.go b/pkg/machine/qemu/machine.go index ea37098de5..3059ed82c6 100644 --- a/pkg/machine/qemu/machine.go +++ b/pkg/machine/qemu/machine.go @@ -194,14 +194,15 @@ func (v *MachineVM) Init(opts machine.InitOptions) (bool, error) { } builder := ignition.NewIgnitionBuilder(ignition.DynamicIgnition{ - Name: opts.Username, - Key: key, - VMName: v.Name, - VMType: define.QemuVirt, - TimeZone: opts.TimeZone, - WritePath: v.getIgnitionFile(), - UID: v.UID, - Rootful: v.Rootful, + Name: opts.Username, + Key: key, + VMName: v.Name, + VMType: define.QemuVirt, + TimeZone: opts.TimeZone, + WritePath: v.getIgnitionFile(), + UID: v.UID, + Rootful: v.Rootful, + NetRecover: useNetworkRecover(), }) // If the user provides an ignition file, we need to diff --git a/pkg/machine/qemu/options_darwin.go b/pkg/machine/qemu/options_darwin.go index 124358db80..052ddbccf7 100644 --- a/pkg/machine/qemu/options_darwin.go +++ b/pkg/machine/qemu/options_darwin.go @@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) { } return tmpDir, nil } + +func useNetworkRecover() bool { + return true +} diff --git a/pkg/machine/qemu/options_freebsd.go b/pkg/machine/qemu/options_freebsd.go index 124358db80..94f01a3800 100644 --- a/pkg/machine/qemu/options_freebsd.go +++ b/pkg/machine/qemu/options_freebsd.go @@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) { } return tmpDir, nil } + +func useNetworkRecover() bool { + return false +} diff --git a/pkg/machine/qemu/options_linux.go b/pkg/machine/qemu/options_linux.go index 15b162aeb4..04303d402e 100644 --- a/pkg/machine/qemu/options_linux.go +++ b/pkg/machine/qemu/options_linux.go @@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) { } return util.GetRootlessRuntimeDir() } + +func useNetworkRecover() bool { + return false +} diff --git a/pkg/machine/qemu/options_windows.go b/pkg/machine/qemu/options_windows.go index 69652ee39e..2ccdac2cb1 100644 --- a/pkg/machine/qemu/options_windows.go +++ b/pkg/machine/qemu/options_windows.go @@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) { } return tmpDir, nil } + +func useNetworkRecover() bool { + return false +}