From 5d6ce1831d8dbbef7a9bc02570ef625d0f1e15e1 Mon Sep 17 00:00:00 2001 From: Christian Theune Date: Fri, 11 Oct 2024 16:55:05 +0200 Subject: [PATCH] guest agent: revamp synchronisation and assist thawing We think that this is the cause of PL-132809 (VMs unavailable due to hung tasks / guest freeze). The previous way to establish a connection was broken as it misunderstood the synchronisation mechanism and the way the guest agent tries to send pending messages to the socket. The new approach first tries to send a gratuitous thaw, flush out any pending messages and then tries once to ping and synchronize - otherwise erroring out immediately. Also, when freezing we try harder to thaw if we failed and we increased the freezing timeout to 5 minutes. Fixes PL-132809 --- CHANGES.txt | 6 +- src/fc/qemu/agent.py | 41 +++--- src/fc/qemu/hazmat/conftest.py | 5 + src/fc/qemu/hazmat/guestagent.py | 143 +++++++++++++------- src/fc/qemu/hazmat/qemu.py | 78 ++++++----- src/fc/qemu/hazmat/tests/guestagent_test.py | 41 +++--- src/fc/qemu/hazmat/tests/test_qemu.py | 8 +- src/fc/qemu/tests/test_consul.py | 17 +-- src/fc/qemu/tests/vm_test.py | 58 ++------ 9 files changed, 201 insertions(+), 196 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 03ab432..dab1879 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,10 +1,12 @@ Release notes ============= -1.4.7 (unreleased) +1.5.0 (unreleased) ------------------ -- Nothing changed yet. +- Revamp guest agent connection handling and try harder to thaw a VM + that didn't timely manage to freeze. Increase freeze timeout, too. + (PL-132809) 1.4.6 (2024-09-26) diff --git a/src/fc/qemu/agent.py b/src/fc/qemu/agent.py index 9bbf7fd..d0ef734 100644 --- a/src/fc/qemu/agent.py +++ b/src/fc/qemu/agent.py @@ -1013,7 +1013,10 @@ def ensure_offline(self): self.stop() else: self.log.info( - "ensure-state", wanted="offline", found="offline", action="none" + "ensure-state", + wanted="offline", + found="offline", + action="none", ) def ensure_online_remote(self): @@ -1225,23 +1228,25 @@ def frozen_vm(self): """ frozen = False - try: - if self.qemu.is_running(): - self.log.info("freeze", volume="root") - try: - self.qemu.freeze() - frozen = True - except Exception as e: - self.log.error( - "freeze-failed", - reason=str(e), - action="continue", - machine=self.name, - ) - yield frozen - finally: - if self.qemu.is_running(): - self.ensure_thawed() + if self.qemu.is_running(): + self.log.info("freeze", volume="root") + try: + self.qemu.freeze() + frozen = True + except Exception as e: + self.log.error( + "freeze-failed", + reason=str(e), + action="continue", + machine=self.name, + ) + yield frozen + # We're not doing this with a finally because if qemu.freeze failed + # initially then we can't reliably communicate with the agent and + # there are measures already in place to assist in the unreliable + # case in `freeze()` + if frozen and self.qemu.is_running(): + self.ensure_thawed() @locked() @running(True) diff --git a/src/fc/qemu/hazmat/conftest.py b/src/fc/qemu/hazmat/conftest.py index 9bec993..f7c05c1 100644 --- a/src/fc/qemu/hazmat/conftest.py +++ b/src/fc/qemu/hazmat/conftest.py @@ -262,6 +262,8 @@ class ClientStub(object): messages_sent: typing.List[bytes] responses: typing.List[str] + receive_buffer = "" + def __init__(self): self.messages_sent = [] self.responses = [] @@ -278,6 +280,9 @@ def close(self): def send(self, msg: bytes): self.messages_sent.append(msg) + def recv(self, buffersize): + return self.receive_buffer + def makefile(self): pseudo_socket_filename = tempfile.mktemp(dir=tmpdir) with open(pseudo_socket_filename, "w") as f: diff --git a/src/fc/qemu/hazmat/guestagent.py b/src/fc/qemu/hazmat/guestagent.py index 36673c2..9257097 100644 --- a/src/fc/qemu/hazmat/guestagent.py +++ b/src/fc/qemu/hazmat/guestagent.py @@ -16,82 +16,129 @@ class GuestAgent(object): def __init__(self, machine, timeout, client_factory=socket.socket): self.machine = machine self.timeout = timeout - self.log = log.bind(machine=machine) + self.log = log.bind(machine=machine, subsystem="qemu/guestagent") self.file = None self.client_factory = client_factory self.client = None - def read(self): + def read(self, unwrap=True): """Reads single response from the GA and returns the result. Blocks and runs into a timeout if no response is available. """ - result = json.loads(self.file.readline()) + try: + result = self.file.readline() + except socket.timeout: + self.log.debug("read-timeout") + self.disconnect() + raise + result = json.loads(result) + self.log.debug("read", result=result) if "error" in result: raise ClientError(result) - return result["return"] + if unwrap: + # Some commands, like guest-ping do not return a result. But we do + # not want to silently swallow errors if a return value is missing + # but expected. + return result["return"] + # However, to ensure that things like the guest-ping did receive a + # proper result structure (e.g. {}) we do return it, so the command + # can detect whether everything is as expected. We explicitly do not + # just silently return `None` here. + return result - def cmd(self, cmd, flush_ga_parser=False, timeout=None, **args): + def cmd(self, cmd, timeout=None, fire_and_forget=False, **args): """Issues GA command and returns the result. - All **args need to be serialisable to JSON, that implies that `bytes` are *not* - valid. + + All **args need to be serialisable to JSON, that implies that `bytes` + are *not* valid. + """ + self.connect() message = json.dumps({"execute": cmd, "arguments": args}) - message = message.encode("ascii") - if flush_ga_parser: - # \xff is an invalid utf-8 character and recommended to safely - # ensure that the parser of the guest agent at the other end - # is reset to a known state. This is recommended for sync. - # http://wiki.qemu-project.org/index.php/Features/GuestAgent#guest-sync - message = b"\xff" + message - timeout = timeout or self.timeout - # Allow setting temporary timeouts for operations that are known to be - # slow. - self.client.settimeout(timeout) + message = message.encode("utf-8") self.client.send(message) - return self.read() + if not fire_and_forget: + self.client.settimeout(timeout or self.timeout) + return self.read(unwrap=(cmd != "guest-ping")) def sync(self): """Ensures that request and response are in order.""" - sync_id = random.randint(0, 0xFFFF) - n = 0 + + # Phase 1: ensure a low-level thaw command. This is an emergency safety + # belt. We really do not want the VM to be accidentally stuck in a + # frozen state. + self.log.debug("sync-gratuitous-thaw") + self.client.send( + json.dumps({"execute": "guest-fsfreeze-thaw"}).encode("utf-8") + ) + + # Phase 2: clear the connection buffer from previous connections. + # We set a very short timeout (nonblocking does not help, the kernel + # needs a little time to give access to the data already buffered). + # However, we want to keep it as short as possible because this timeout + # will always happen in the happy case which is most of the time. + self.client.settimeout(1) + self.log.debug("clear-buffer") try: - result = self.cmd("guest-sync", id=sync_id, flush_ga_parser=True) - except ClientError: - # we tripped a client error as we caused the guest agent to notice - # invalid json, which in turn triggers an error response - result = self.read() - except socket.error as e: - # Maybe a timeout. Keep trying a little bit harder. - result = str(e) - while n < 10: - if result == sync_id: - return - self.log.error( - "incorrect-sync-id", expected=sync_id, got=result, tries=n + while buffer := self.client.recv(4096): + self.log.debug("found-buffer-garbage", buffer=buffer) + except (BlockingIOError, socket.timeout): + self.log.debug("cleared-buffer") + + # Phase 3: ensure we see proper agent interactions. To be sure we + # test this with two diagnostic calls. The timeout can be higher now + # as we expect the agent to actually have to respond to us. + + # \xff is an invalid utf-8 character and recommended to safely + # ensure that the parser of the guest agent at the other end + # is reset to a known state. This is recommended for sync. + # http://wiki.qemu-project.org/index.php/Features/GuestAgent#guest-sync + self.log.debug("clear-guest-parser") + self.client.send(b"\xff") + + ping_result = self.cmd("guest-ping") + if ping_result != {}: + raise ClientError( + f"Agent did not respond properly to ping command: {ping_result}" ) - n += 1 - try: - result = self.read() - except (ClientError, socket.error): - # we tripped a client error later than right now. There may - # have been a response still in the queue. - pass + + sync_id = random.randint(0, 0xFFFF) + result = self.cmd("guest-sync", timeout=30, id=sync_id) + + self.log.debug("sync-response", expected=sync_id, got=result) + if result == sync_id: + return raise ClientError( - "Unable to sync with guest agent after {} tries.".format(n) + f"Unable to sync with guest agent. Got invalid sync_id {sync_id}" ) - def __enter__(self): + def connect(self): + if self.client and self.file: + return + self.disconnect() self.client = self.client_factory(socket.AF_UNIX, socket.SOCK_STREAM) - self.client.settimeout(self.timeout) self.client.connect("/run/qemu.{}.gqa.sock".format(self.machine)) self.file = self.client.makefile() fcntl.flock(self.file.fileno(), fcntl.LOCK_EX) self.sync() - return self - def __exit__(self, exc_type, exc_value, traceback): - self.file.close() - self.client.close() + def disconnect(self): + if self.file or self.client: + self.log.debug("disconnect") + try: + if self.file: + self.file.close() + except Exception: + pass + finally: + self.file = None + try: + if self.client: + self.client.close() + except Exception: + pass + finally: + self.client = None diff --git a/src/fc/qemu/hazmat/qemu.py b/src/fc/qemu/hazmat/qemu.py index f1881bc..665aba1 100644 --- a/src/fc/qemu/hazmat/qemu.py +++ b/src/fc/qemu/hazmat/qemu.py @@ -25,9 +25,11 @@ # timeout of 3 seconds will cause everything to explode when # the guest takes too long. We've seen 16 seconds as a regular # period in some busy and large machines. I'm being _very_ -# generous using a 120s timeout here. +# generous using a 5 minute timeout here. We've seen it get stuck longer +# than 2 minutes and the agent is very stubborn in those cases and really +# doesn't like if the client goes away ... # This is a global variable so we can instrument it during testing. -FREEZE_TIMEOUT = 120 +FREEZE_TIMEOUT = 300 class InvalidMigrationStatus(Exception): @@ -75,7 +77,9 @@ def locked(self, *args, **kw): finally: self._global_lock_count -= 1 self.log.debug( - "global-lock-status", target=LOCK, count=self._global_lock_count + "global-lock-status", + target=LOCK, + count=self._global_lock_count, ) if self._global_lock_count == 0: self.log.debug("global-lock-release", target=LOCK) @@ -186,6 +190,7 @@ def __enter__(self): pass def __exit__(self, exc_value, exc_type, exc_tb): + self.guestagent.disconnect() if self.__qmp: self.__qmp.close() self.__qmp = None @@ -361,32 +366,25 @@ def start(self): return def freeze(self): - with self.guestagent as guest: - try: - # This request may take a _long_ _long_ time and the default - # timeout of 3 seconds will cause everything to explode when - # the guest takes too long. We've seen 16 seconds as a regular - # period in some busy and large machines. I'm being _very_ - # generous using a 120s timeout here. - guest.cmd("guest-fsfreeze-freeze", timeout=FREEZE_TIMEOUT) - except ClientError: - self.log.debug("guest-fsfreeze-freeze-failed", exc_info=True) - assert guest.cmd("guest-fsfreeze-status") == "frozen" - - def _thaw_via_guest_agent(self): - with self.guestagent as guest: - try: - guest.cmd("guest-fsfreeze-thaw") - except ClientError: - self.log.debug("guest-fsfreeze-freeze-thaw", exc_info=True) - raise - result = guest.cmd("guest-fsfreeze-status") - if result != "thawed": - raise RuntimeError("Unexpected thaw result: {}".format(result)) + try: + # This request may take a _long_ _long_ time and the default + # timeout of 3 seconds will cause everything to explode when + # the guest takes too long. We've seen 16 seconds as a regular + # period in some busy and large machines. So we increase this + # to a lot more and also perform a gratuitous thaw in case + # we error out. + self.guestagent.cmd("guest-fsfreeze-freeze", timeout=FREEZE_TIMEOUT) + except ClientError: + self.log.debug("guest-fsfreeze-freeze-failed", exc_info=True) + self.guestagent.cmd("guest-fsfreeze-thaw", fire_and_forget=True) + assert self.guestagent.cmd("guest-fsfreeze-status") == "frozen" def thaw(self): try: - self._thaw_via_guest_agent() + self.guestagent.cmd("guest-fsfreeze-thaw") + result = self.guestagent.cmd("guest-fsfreeze-status") + if result != "thawed": + raise RuntimeError("Unexpected thaw result: {}".format(result)) except Exception: self.log.warning("guest-fsfreeze-thaw-failed", exc_info=True) raise @@ -394,19 +392,18 @@ def thaw(self): def write_file(self, path, content: bytes): if not isinstance(content, bytes): raise TypeError("Expected bytes, got string.") - with self.guestagent as guest: - try: - handle = guest.cmd("guest-file-open", path=path, mode="w") - guest.cmd( - "guest-file-write", - handle=handle, - # The ASCII armour needs to be turned into text again, because the - # JSON encoder doesn't handle bytes-like objects. - **{"buf-b64": encode(content, "base64").decode("ascii")}, - ) - guest.cmd("guest-file-close", handle=handle) - except ClientError: - self.log.error("guest-write-file", exc_info=True) + try: + handle = self.guestagent.cmd("guest-file-open", path=path, mode="w") + self.guestagent.cmd( + "guest-file-write", + handle=handle, + # The ASCII armour needs to be turned into text again, because the + # JSON encoder doesn't handle bytes-like objects. + **{"buf-b64": encode(content, "base64").decode("ascii")}, + ) + self.guestagent.cmd("guest-file-close", handle=handle) + except ClientError: + self.log.error("guest-write-file", exc_info=True) def inmigrate(self): self._start(["-incoming {}".format(self.migration_address)]) @@ -436,7 +433,8 @@ def migrate(self, address): ) self.qmp.command("migrate", uri=address) self.log.debug( - "migrate-parameters", **self.qmp.command("query-migrate-parameters") + "migrate-parameters", + **self.qmp.command("query-migrate-parameters"), ) def poll_migration_status(self, timeout=30): diff --git a/src/fc/qemu/hazmat/tests/guestagent_test.py b/src/fc/qemu/hazmat/tests/guestagent_test.py index 29b4d36..7df9e14 100644 --- a/src/fc/qemu/hazmat/tests/guestagent_test.py +++ b/src/fc/qemu/hazmat/tests/guestagent_test.py @@ -18,43 +18,36 @@ def test_ga_read_error(guest_agent): def test_ga_sync_immediate(guest_agent): guest_agent._client_stub.responses = [ + "{}", '{"return": 87643}', ] - with guest_agent: - # This causes an implicit sync and wires up the client stub. - assert guest_agent.file.fileno() - assert guest_agent.client is not None + guest_agent.connect() - assert guest_agent.client.messages_sent == [ - b'\xff{"execute": "guest-sync", "arguments": {"id": 87643}}' - ] - - -def test_ga_sync_retry(guest_agent): - guest_agent._client_stub.responses = [ - '{"return": 2}', - '{"return": 87643}', - ] - - with guest_agent: - # This causes an implicit sync and wires up the client stub. - assert True + # This causes an implicit sync and wires up the client stub. + assert guest_agent.file.fileno() + assert guest_agent.client is not None assert guest_agent.client.messages_sent == [ - b'\xff{"execute": "guest-sync", "arguments": {"id": 87643}}' + b'{"execute": "guest-fsfreeze-thaw"}', + b"\xff", + b'{"execute": "guest-ping", "arguments": {}}', + b'{"execute": "guest-sync", "arguments": {"id": 87643}}', ] -def test_ga_sync_too_often(guest_agent): +def test_ga_sync_wrong_response(guest_agent): guest_agent._client_stub.responses = [ - f'{{"return": {x}}}' for x in range(20) + "{}", + '{"return": 1}', ] with pytest.raises(ClientError): - with guest_agent: - pass + guest_agent.connect() assert guest_agent.client.messages_sent == [ - b'\xff{"execute": "guest-sync", "arguments": {"id": 87643}}' + b'{"execute": "guest-fsfreeze-thaw"}', + b"\xff", + b'{"execute": "guest-ping", "arguments": {}}', + b'{"execute": "guest-sync", "arguments": {"id": 87643}}', ] diff --git a/src/fc/qemu/hazmat/tests/test_qemu.py b/src/fc/qemu/hazmat/tests/test_qemu.py index 13e4312..c290f66 100644 --- a/src/fc/qemu/hazmat/tests/test_qemu.py +++ b/src/fc/qemu/hazmat/tests/test_qemu.py @@ -17,6 +17,8 @@ def test_write_file_no_error(guest_agent): # the emulated answers of the guest agent: guest_agent._client_stub.responses = [ + # ping + "{}", # sync ID, hard-coded in fixture '{"return": 87643}', # emulated non-empty result of executions: @@ -32,8 +34,12 @@ def test_write_file_no_error(guest_agent): qemu.write_file("/tmp/foo", b'"asdf"') print(guest_agent.client.messages_sent) + assert guest_agent.client.messages_sent == [ - b'\xff{"execute": "guest-sync", "arguments": {"id": 87643}}', + b'{"execute": "guest-fsfreeze-thaw"}', + b"\xff", + b'{"execute": "guest-ping", "arguments": {}}', + b'{"execute": "guest-sync", "arguments": {"id": 87643}}', b'{"execute": "guest-file-open", "arguments": {"path": "/tmp/foo", "mode": "w"}}', b'{"execute": "guest-file-write", "arguments": {"handle": "file-handle-1", "buf-b64": "ImFzZGYi\\n"}}', b'{"execute": "guest-file-close", "arguments": {"handle": "file-handle-1"}}', diff --git a/src/fc/qemu/tests/test_consul.py b/src/fc/qemu/tests/test_consul.py index ec00d02..c87cd3f 100644 --- a/src/fc/qemu/tests/test_consul.py +++ b/src/fc/qemu/tests/test_consul.py @@ -266,22 +266,9 @@ def test_snapshot_online_vm(vm): snapshot machine=simplevm snapshot=backy-1234 snapshot-create machine=simplevm name=backy-1234 freeze machine=simplevm volume=root -freeze-failed action=continue machine=simplevm reason=Unable to sync with guest agent after 10 tries. +sync-gratuitous-thaw machine=simplevm subsystem=qemu/guestagent +freeze-failed action=continue machine=simplevm reason=timed out snapshot-ignore machine=simplevm reason=not frozen -ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu -Traceback (most recent call last): -... - raise RuntimeError("VM not frozen, not making snapshot.") -RuntimeError: VM not frozen, not making snapshot. - -During handling of the above exception, another exception occurred: - -Traceback (most recent call last): -... - raise ClientError( -fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. -ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries. handle-key-failed key=snapshot/7468743 Traceback (most recent call last): ... diff --git a/src/fc/qemu/tests/vm_test.py b/src/fc/qemu/tests/vm_test.py index 09278f0..701c6ac 100644 --- a/src/fc/qemu/tests/vm_test.py +++ b/src/fc/qemu/tests/vm_test.py @@ -494,6 +494,7 @@ def test_vm_snapshot_with_missing_guest_agent(vm, monkeypatch): "snapshot", "freeze", "thaw", + "disconnect", ] monkeypatch.setattr(util, "today", lambda: datetime.date(2010, 1, 1)) @@ -511,30 +512,10 @@ def test_vm_snapshot_with_missing_guest_agent(vm, monkeypatch): """\ snapshot-create machine=simplevm name=asdf-keep-until-20100108 freeze machine=simplevm volume=root -freeze-failed action=continue machine=simplevm reason=Unable to sync with guest agent after 10 tries. -snapshot-ignore machine=simplevm reason=not frozen -ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu -Traceback (most recent call last): - File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in frozen_vm - yield frozen - File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in snapshot - raise RuntimeError("VM not frozen, not making snapshot.") -RuntimeError: VM not frozen, not making snapshot. - -During handling of the above exception, another exception occurred: - -Traceback (most recent call last): - File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in thaw - self._thaw_via_guest_agent() - File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in _thaw_via_guest_agent - with self.guestagent as guest: - File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in __enter__ - self.sync() - File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in sync - raise ClientError( -fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. -ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries.""" +sync-gratuitous-thaw machine=simplevm subsystem=qemu/guestagent +disconnect machine=simplevm subsystem=qemu/guestagent +freeze-failed action=continue machine=simplevm reason=timed out +snapshot-ignore machine=simplevm reason=not frozen""" ) == get_log() ) @@ -546,30 +527,10 @@ def test_vm_snapshot_with_missing_guest_agent(vm, monkeypatch): """\ snapshot-create machine=simplevm name=asdf freeze machine=simplevm volume=root -freeze-failed action=continue machine=simplevm reason=... -snapshot-ignore machine=simplevm reason=not frozen -ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu -Traceback (most recent call last): - File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in frozen_vm - yield frozen - File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in snapshot - raise RuntimeError("VM not frozen, not making snapshot.") -RuntimeError: VM not frozen, not making snapshot. - -During handling of the above exception, another exception occurred: - -Traceback (most recent call last): - File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in thaw - self._thaw_via_guest_agent() - File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in _thaw_via_guest_agent - with self.guestagent as guest: - File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in __enter__ - self.sync() - File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in sync - raise ClientError( -fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. -ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries.""" +sync-gratuitous-thaw machine=simplevm subsystem=qemu/guestagent +disconnect machine=simplevm subsystem=qemu/guestagent +freeze-failed action=continue machine=simplevm reason=timed out +snapshot-ignore machine=simplevm reason=not frozen""" ) == get_log() ) @@ -1445,6 +1406,7 @@ def communicate_progress(p): simplevm received-ping timeout=60 simplevm reset-timeout simplevm waiting interval=0 remaining=... +simplevm guest-disconnect """ )