fix: improve data cleaning and error handling in Streamlit display

ist356 · Nov 20, 2024 · 9046343 · 9046343
1 parent 9786ffd
commit 9046343
Showing 1 changed file with 9 additions and 2 deletions.
diff --git a/src/ia_collection_analyzer/streamlit.py b/src/ia_collection_analyzer/streamlit.py
@@ -57,6 +57,7 @@
 data_transform_text = st.text("cleaning data...")
 # keep only columns that are at least 80% non-NaN (thresh = minimum non-NA count)
 items_pd = items_pd.dropna(axis=1, thresh=0.8 * len(items_pd))
+items_pd = items_pd.dropna(axis=0, thresh=0.7 * len(items_pd.columns))
 # (disabled) replace values whose type differs from the column's first entry with NaN.
 # for col in items_pd.columns:
 #    items_pd[col] = items_pd[col].apply(lambda x: x if isinstance(x, type(items_pd[col][0])) else np.nan)
@@ -68,7 +69,12 @@
 data_transform_text.text("Data transformation and cleaning complete!")
 
 st.write("The collection contains the following items:")
-st.write(items_pd.head(10))  # display the first 10 rows of the dataframe
+try:
+    st.write(items_pd.head(10))
+except Exception as e:
+    st.markdown("Failed to display top 10 lines. Only first will be shown.")
+    st.write(items_pd.head(1))
+    st.write(e)
 
 st.header("Selecting columns to analyze")
 st.write("Select additional columns you want to analyze:")
@@ -78,6 +84,7 @@
 selected_columns = st.multiselect("Select columns:", seleactable_columns, default=[])
 
 filtered_pd = items_pd[REQUIRED_METADATA + selected_columns]
+filtered_pd = filtered_pd.dropna(axis=0, how="any")
 
 st.write("Preview of the selected columns:")
-st.write(items_pd.head(10))
+st.write(filtered_pd.head(30))