[COST-861] Tokenizing error - skip bad rows when reading csv #5031

Merged: 5 commits, Apr 22, 2024
18 changes: 11 additions & 7 deletions koku/masu/external/kafka_msg_handler.py
@@ -70,18 +70,21 @@ class KafkaMsgHandlerError(Exception):
     """Kafka msg handler error."""
 
 
-def divide_csv_daily(file_path: os.PathLike, manifest_id: int):
-    """
-    Split local file into daily content.
-    """
-    daily_files = []
-
+def get_data_frame(file_path: os.PathLike):
+    """read csv file into dataframe"""
     try:
-        data_frame = pd.read_csv(file_path, dtype=pd.StringDtype(storage="pyarrow"))
+        return pd.read_csv(file_path, dtype=pd.StringDtype(storage="pyarrow"), on_bad_lines="warn")
     except Exception as error:
         LOG.error(f"File {file_path} could not be parsed. Reason: {str(error)}")
         raise error
 
+
+def divide_csv_daily(file_path: os.PathLike, manifest_id: int):
+    """
+    Split local file into daily content.
+    """
+    data_frame = get_data_frame(file_path)
+
     report_type, _ = utils.detect_type(file_path)
     unique_times = data_frame.interval_start.unique()
     days = list({cur_dt[:10] for cur_dt in unique_times})
@@ -90,6 +93,7 @@ def divide_csv_daily(file_path: os.PathLike, manifest_id: int):
         for cur_day in days
     ]
 
+    daily_files = []
     for daily_data in daily_data_frames:
         day = daily_data.get("date")
         df = daily_data.get("data_frame")
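For context on the one-argument change above: with pandas' default on_bad_lines="error", a row whose field count does not match the header raises pandas.errors.ParserError ("Error tokenizing data"), which matches the "Tokenizing error" in the PR title; on_bad_lines="warn" (available since pandas 1.3) emits a ParserWarning and drops the offending row instead. A minimal standalone sketch, not part of the PR, using an in-memory CSV that mirrors the tokenizing-error.csv fixture below:

import io

import pandas as pd

# Third row has three fields but the header declares two columns.
csv_data = "h1,h2\ncol1,col2\ncol1,col2,col3\n"

# Default behaviour: the parser raises ParserError on the field-count mismatch.
try:
    pd.read_csv(io.StringIO(csv_data))
except pd.errors.ParserError as error:
    print(f"default read_csv raised: {error}")

# on_bad_lines="warn": a ParserWarning is emitted, the bad row is skipped, parsing continues.
frame = pd.read_csv(io.StringIO(csv_data), on_bad_lines="warn")
print(frame.shape)  # (1, 2) -- only the row matching the two-column header remains

Note that get_data_frame still logs and re-raises any other exception, so only rows with a mismatched field count are tolerated; structurally broken files continue to fail loudly.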
3 changes: 3 additions & 0 deletions koku/masu/test/data/ocp/tokenizing-error.csv
@@ -0,0 +1,3 @@
+h1,h2
+col1,col2
+col1,col2,col3
3 changes: 3 additions & 0 deletions koku/masu/test/data/ocp/valid-csv.csv
@@ -0,0 +1,3 @@
+h1,h2
+col1,col2
+col1,col2
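As a rough check of what these fixtures should yield, here is a sketch using the same read_csv arguments the new get_data_frame passes. It assumes it is run from the repository root with pyarrow installed for the string dtype; the shapes in the comments are inferred from the skip-and-warn behaviour, not copied from the PR.

from pathlib import Path

import pandas as pd

# Same read_csv arguments the new get_data_frame passes.
kwargs = {"dtype": pd.StringDtype(storage="pyarrow"), "on_bad_lines": "warn"}

valid = pd.read_csv(Path("./koku/masu/test/data/ocp/valid-csv.csv"), **kwargs)
print(valid.shape)  # expected (2, 2): both data rows match the h1,h2 header

bad = pd.read_csv(Path("./koku/masu/test/data/ocp/tokenizing-error.csv"), **kwargs)
print(bad.shape)  # expected (1, 2): the three-field row is dropped with a ParserWarning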
13 changes: 13 additions & 0 deletions koku/masu/test/external/test_kafka_msg_handler.py
@@ -941,6 +941,19 @@ def test_divide_csv_daily(self):
         for expected_item in expected:
             self.assertIn(expected_item, daily_files)
 
+    def test_get_data_frame_no_tokenizing_error(self):
+        """Test get_data_frame does not raise Tokenizing error when reading files."""
+
+        file_paths = [
+            Path("./koku/masu/test/data/ocp/valid-csv.csv"),
+            Path("./koku/masu/test/data/ocp/tokenizing-error.csv"),
+        ]
+        for file_path in file_paths:
+            try:
+                msg_handler.get_data_frame(file_path)
+            except Exception:
+                self.fail(f"failed to read: {file_path}")
+
     @patch("masu.external.kafka_msg_handler.os")
     @patch("masu.external.kafka_msg_handler.copy_local_report_file_to_s3_bucket")
     @patch("masu.external.kafka_msg_handler.divide_csv_daily")
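The new test only asserts that get_data_frame no longer raises; a stricter follow-up (hypothetical, not part of this PR) could also assert that the malformed row is actually dropped. A sketch as a standalone case, assuming it runs from the repository root with the koku Django settings configured the same way the existing suite expects:

import unittest
from pathlib import Path

from masu.external import kafka_msg_handler as msg_handler


class GetDataFrameBadRowsTest(unittest.TestCase):
    """Hypothetical follow-up case; not part of the PR."""

    def test_bad_row_is_dropped(self):
        data_frame = msg_handler.get_data_frame(Path("./koku/masu/test/data/ocp/tokenizing-error.csv"))
        self.assertEqual(len(data_frame), 1)  # only the row matching the two-column header remains
        self.assertEqual(list(data_frame.columns), ["h1", "h2"])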