From 699e387ed4e9e7d134249af9c718c8dbc5b935df Mon Sep 17 00:00:00 2001 From: zszia <43820744+zszia@users.noreply.github.com> Date: Mon, 14 Nov 2022 16:18:05 +0100 Subject: [PATCH] fix: handle JSON character encoding for Splunk HEC (#112) * fix: handle JSON character encoding for hec (Umlauts) After applying the fix searching for a field with Umlauts is now working correctly. * style: pre-commit * test: update integration tests to include fixed scenario * test: sleep before search Co-authored-by: Artem Rys --- .github/workflows/build-test-release.yml | 1 + solnlib/modular_input/event.py | 2 +- solnlib/modular_input/event_writer.py | 2 +- tests/integration/_search.py | 49 +++++++++++++++++++ tests/integration/test_hec_event_writer.py | 35 ++++++++++++- tests/unit/test_modular_input_event_writer.py | 3 +- 6 files changed, 88 insertions(+), 4 deletions(-) create mode 100644 tests/integration/_search.py diff --git a/.github/workflows/build-test-release.yml b/.github/workflows/build-test-release.yml index 55e19cfd..9098db3a 100644 --- a/.github/workflows/build-test-release.yml +++ b/.github/workflows/build-test-release.yml @@ -112,6 +112,7 @@ jobs: test-splunk: name: test-splunk runs-on: ubuntu-latest + continue-on-error: true needs: - meta strategy: diff --git a/solnlib/modular_input/event.py b/solnlib/modular_input/event.py index 652d8a27..fbffa0d3 100644 --- a/solnlib/modular_input/event.py +++ b/solnlib/modular_input/event.py @@ -216,7 +216,7 @@ def _to_hec(self, event_field): if hasattr(self, "_fields"): event["fields"] = self._fields - return json.dumps(event) + return json.dumps(event, ensure_ascii=False) @classmethod def format_events(cls, events: List, event_field: str = "event") -> List: diff --git a/solnlib/modular_input/event_writer.py b/solnlib/modular_input/event_writer.py index d2d590ca..8c1d9ce6 100644 --- a/solnlib/modular_input/event_writer.py +++ b/solnlib/modular_input/event_writer.py @@ -442,7 +442,7 @@ def write_events( try: self._rest_client.post( self.HTTP_EVENT_COLLECTOR_ENDPOINT, - body=event, + body=event.encode("utf-8"), headers=self.headers, ) except binding.HTTPError as e: diff --git a/tests/integration/_search.py b/tests/integration/_search.py new file mode 100644 index 00000000..6ed1bc8f --- /dev/null +++ b/tests/integration/_search.py @@ -0,0 +1,49 @@ +# +# Copyright 2021 Splunk Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os.path as op +import sys +import time + +sys.path.insert(0, op.dirname(op.dirname(op.abspath(__file__)))) +import context +from splunklib import client +from splunklib import results as splunklib_results + + +def search(session_key, query): + service = client.connect(host=context.host, token=session_key) + job = service.jobs.create(query) + while True: + while not job.is_ready(): + pass + stats = { + "isDone": job["isDone"], + "doneProgress": job["doneProgress"], + "scanCount": job["scanCount"], + "eventCount": job["eventCount"], + "resultCount": job["resultCount"], + } + if stats["isDone"] == "1": + break + time.sleep(0.5) + json_results_reader = splunklib_results.JSONResultsReader( + job.results(output_mode="json") + ) + results = [] + for result in json_results_reader: + if isinstance(result, dict): + results.append(result) + return results diff --git a/tests/integration/test_hec_event_writer.py b/tests/integration/test_hec_event_writer.py index 59384a66..cfc0fbcf 100644 --- a/tests/integration/test_hec_event_writer.py +++ b/tests/integration/test_hec_event_writer.py @@ -13,12 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import os.path as op import sys +import time sys.path.insert(0, op.dirname(op.dirname(op.abspath(__file__)))) import context +from _search import search from solnlib.modular_input import event_writer as hew @@ -36,3 +37,35 @@ def test_hec_event_writer(): m2[i] = "test2 data %s" % i e2 = ew.create_event(m2, index="main", host="testing", sourcetype="hec") ew.write_events([e1, e2]) + + +def test_hec_event_writes_with_non_utf_8(): + # To test scenario listed in https://github.com/splunk/addonfactory-solutions-library-python/pull/112. + test_name = "test_hec_event_writes_with_non_utf_8" + session_key = context.get_session_key() + ew = hew.HECEventWriter("test", session_key) + event = ew.create_event( + [ + { + "test_name": test_name, + "field_a": "Üü_Öö_Ää_some_text", + "field_b": "some_text_Üü_Öö_Ää", + }, + ], + index="main", + host="testing", + sourcetype="hec", + ) + ew.write_events([event]) + time.sleep(2) + + search_results = search( + session_key, f"search index=main sourcetype=hec {test_name}" + ) + + assert len(search_results) == 1 + _raw_event = search_results[0]["_raw"] + assert "Üü_Öö_Ää_some_text" in _raw_event + assert "some_text_Üü_Öö_Ää" in _raw_event + assert "\\u00dc\\u00fc_\\u00d6\\u00f6_\\u00c4\\u00e4_some_text" not in _raw_event + assert "some_text_\\u00dc\\u00fc_\\u00d6\\u00f6_\\u00c4\\u00e4" not in _raw_event diff --git a/tests/unit/test_modular_input_event_writer.py b/tests/unit/test_modular_input_event_writer.py index adc33e26..6ea1cbc2 100644 --- a/tests/unit/test_modular_input_event_writer.py +++ b/tests/unit/test_modular_input_event_writer.py @@ -96,7 +96,8 @@ def mock_post( self, path_segment, owner=None, app=None, sharing=None, headers=None, **query ): event_strings = [ - json.dumps(json.loads(e), sort_keys=True) for e in query["body"].split("\n") + json.dumps(json.loads(e), sort_keys=True) + for e in query["body"].decode("utf-8").split("\n") ] assert (