Skip to content

Commit

Permalink
chore: move code to cache the calculated pd for speedup
Browse files Browse the repository at this point in the history
  • Loading branch information
Ovler-Young committed Nov 20, 2024
1 parent bff58b1 commit b18f067
Showing 1 changed file with 17 additions and 17 deletions.
34 changes: 17 additions & 17 deletions src/ia_collection_analyzer/streamlit.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@
)
items = fetch_metadata(collection_id)
items_pd = pd.DataFrame(items)

data_transform_text = st.text("cleaning data...")
# Keep only the columns that are non-NaN in at least 80% of the rows, then
# keep only the rows that are non-NaN in at least 70% of the remaining columns.
items_pd = items_pd.dropna(axis=1, thresh=0.8 * len(items_pd))
items_pd = items_pd.dropna(axis=0, thresh=0.7 * len(items_pd.columns))
# Drop the rows whose mediatype is "collection". The explicit copy makes the
# filtered result an independent frame, so the column assignments below do
# not raise pandas' SettingWithCopyWarning.
items_pd = items_pd[items_pd["mediatype"] != "collection"].copy()

# Disabled step: replace cells whose type differs from the first value of
# their column with NaN.
# for col in items_pd.columns:
# items_pd[col] = items_pd[col].apply(lambda x: x if isinstance(x, type(items_pd[col][0])) else np.nan)

# Parse the date columns into pandas datetimes.
data_transform_text.text("calculating metadata...")
items_pd["addeddate"] = pd.to_datetime(items_pd["addeddate"])
items_pd["publicdate"] = pd.to_datetime(items_pd["publicdate"])
data_transform_text.text("Data transformation and cleaning complete!")

# Update cache
st.session_state.collection_id = collection_id
Expand All @@ -54,23 +71,6 @@
)
items_pd = st.session_state.items_pd

data_transform_text = st.text("cleaning data...")
# Keep only the columns that are non-NaN in at least 80% of the rows, then
# keep only the rows that are non-NaN in at least 70% of the remaining columns.
items_pd = items_pd.dropna(axis=1, thresh=0.8 * len(items_pd))
items_pd = items_pd.dropna(axis=0, thresh=0.7 * len(items_pd.columns))
# Drop the rows whose mediatype is "collection". The explicit copy makes the
# filtered result an independent frame, so the column assignments below do
# not raise pandas' SettingWithCopyWarning.
items_pd = items_pd[items_pd["mediatype"] != "collection"].copy()

# Disabled step: replace cells whose type differs from the first value of
# their column with NaN.
# for col in items_pd.columns:
# items_pd[col] = items_pd[col].apply(lambda x: x if isinstance(x, type(items_pd[col][0])) else np.nan)

# Parse the date columns into pandas datetimes.
data_transform_text.text("calculating metadata...")
items_pd["addeddate"] = pd.to_datetime(items_pd["addeddate"])
items_pd["publicdate"] = pd.to_datetime(items_pd["publicdate"])
data_transform_text.text("Data transformation and cleaning complete!")

st.write("The collection contains the following items:")
try:
st.write(items_pd.head(10))
Expand Down

0 comments on commit b18f067

Please sign in to comment.