From ae077f8e9e92e164e574d3c3f2e08fd33f767d85 Mon Sep 17 00:00:00 2001 From: arroyo38 Date: Thu, 19 Dec 2024 15:34:18 -0500 Subject: [PATCH] EDA document should appear now --- tools-appendix/modules/python/nav.adoc | 64 +--- tools-appendix/modules/python/pages/eda.adoc | 296 ++++++++++++++++++ .../modules/python/pages/index.adoc | 2 +- 3 files changed, 310 insertions(+), 52 deletions(-) create mode 100644 tools-appendix/modules/python/pages/eda.adoc diff --git a/tools-appendix/modules/python/nav.adoc b/tools-appendix/modules/python/nav.adoc index 7035a7341..9878d3f68 100644 --- a/tools-appendix/modules/python/nav.adoc +++ b/tools-appendix/modules/python/nav.adoc @@ -1,52 +1,14 @@ * xref:index.adoc[Python] -** xref:python-starter-skills-roadmap.adoc[Python Starter Skills Roadmap] -** xref:indentation.adoc[Indentation] -** xref:variables.adoc[Variables] -** xref:printing-and-f-strings.adoc[Printing and F-Strings] -** xref:logical-operators.adoc[Logical Operators] -** xref:tuples.adoc[Tuples] -** xref:lists.adoc[Lists] -** xref:dictionaries.adoc[Dictionaries] -** xref:sets.adoc[Sets] -** xref:control-flow.adoc[Control Flow] -** xref:writing-functions.adoc[Writing Functions] -** xref:classes.adoc[Classes] -** xref:writing-scripts.adoc[Writing Scripts] -*** xref:argparse.adoc[argparse] -** xref:pandas-intro.adoc[pandas] -*** xref:pandas-read-write-data.adoc[Reading & Writing Data] -*** xref:pandas-series.adoc[Series] -*** xref:pandas-dataframes.adoc[DataFrames] -*** xref:pandas-indexing.adoc[Indexing] -*** xref:pandas-dates-and-times.adoc[Dates and Times] -*** xref:pandas-aggregate-functions.adoc[Aggregate Functions] -*** xref:pandas-reshaping.adoc[Reshaping] -** xref:python-scraping.adoc[Scraping] -*** xref:requests.adoc[Requests] -*** xref:lxml.adoc[lxml] -*** xref:selenium.adoc[Selenium] -*** xref:web-scraping-anvil.adoc[Running on Anvil] -** xref:plotting.adoc[Plotting] -*** xref:matplotlib.adoc[Matplotlib] -*** xref:plotly-examples.adoc[Plotly] -** 
xref:documentation.adoc[Documentation] -*** xref:docstrings-and-comments.adoc[Docstrings & Comments] -*** xref:pdoc.adoc[pdoc] -*** xref:sphinx.adoc[Sphinx] -** xref:testing.adoc[Testing] -*** xref:pytest.adoc[pytest] -*** xref:mypy.adoc[mypy] -** xref:serialization-and-deserialization.adoc[Serialization & Deserialization] -*** xref:messagepack.adoc[MessagePack] -** xref:dask.adoc[Dask] -** xref:jax.adoc[JAX] -** xref:python-package-management.adoc[Package Management] -*** xref:package-management-fundamentals.adoc[Package Management Fundametals] -*** xref:pypi.adoc[PyPi] -*** xref:pip.adoc[Pip] -*** xref:virtualenv.adoc[Virtualenv] -*** xref:pipenv.adoc[Pipenv] -*** xref:poetry.adoc[Poetry] -*** xref:anaconda.adoc[Anaconda] -** https://codingbat.com/python[Python Coding Examples (Coding Bat)] -** https://docs.python.org/3/[Python Official Documentation] \ No newline at end of file +* xref:introduction-to-jupyter-lab.adoc[Introduction to Jupyter] +* xref:basics-programming.adoc[Basics of Programming] +* xref:lists-dictionaries-tuples-loops.adoc[Introduction: Lists, Tuples, Dictionaries] +* xref:eda.adoc[Basics of Exploratory Data Analysis (EDA)] +* xref:control-flow.adoc[Control Flow] +* xref:filtering-and-selecting.adoc[Filtering and Selecting] +* xref:matplotlib.adoc[Data Visualization with matplotlib] +* xref:plotly-examples.adoc[Data Visualization with plotly] +* xref:writing-functions.adoc[Writing Functions in Python] +* xref:writing-scripts.adoc[Writing Scripts in Python] +* xref:pandas-dates-and-times.adoc[Handling Dates and Times in pandas] +* xref:pandas-aggregate-functions.adoc[Applying Aggregate Functions in pandas] +* xref:pandas-reshaping.adoc[Reshaping Data in pandas] \ No newline at end of file diff --git a/tools-appendix/modules/python/pages/eda.adoc b/tools-appendix/modules/python/pages/eda.adoc new file mode 100644 index 000000000..e33b9ac0c --- /dev/null +++ b/tools-appendix/modules/python/pages/eda.adoc @@ -0,0 +1,296 @@ += Getting Started with 
Exploratory Data Analysis + +Exploratory Data Analysis or EDA is one of the most important steps when understanding your data. During EDA you learn about the data that you have available to you, and develop some questions or goals with the data (if you don't already have them). + +The most common operations start with reading data into a DataFrame, accessing the DataFrame’s attributes, and using the DataFrame’s methods to perform operations on the underlying data or with other DataFrames. + + + +In this document we will: + +* Show how to load and inspect data using pandas (`read_csv`, `head`, `len`, `shape`). + +* Explore data attributes (`columns`, `unique`, `value_counts`, `isin`) + +* Perform data transformations (`rename`, `iloc`) + +* Summarize datasets with `describe()` + +== Basic Functions in EDA + +Here we list some functions commonly used for EDA and DataFrames. You can explore how they get used in the code examples below. + +NOTE: The following functions are for the pandas package, but most data manipulation packages (like numpy) will have similar functionality, sometimes even with the same names. + +- head() +- describe() +- len() +- columns +- shape +- iloc +- unique() +- info() +- value_counts() +- isin() + +== Reading in the Data +The `pandas` library provides various functions to load tabular data into a DataFrame. Since our dataset is in CSV format (which uses a comma as the default delimiter), we'll use the `read_csv()` function. For additional methods to import data, refer to this resource: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html[pandas official documentation]. + +Please watch the videos below for an additional walkthrough: +++++ + +++++ + +++++ + +++++ + + +We will start by reading in the airports dataset. 
+ +`/anvil/projects/tdm/data/flights/subset/airports.csv` + +[source, python] +---- +import pandas as pd +myDF = pd.read_csv("/anvil/projects/tdm/data/flights/subset/airports.csv") +---- + +Let's use the `.head()` function to view its contents. + +[source, python] +---- +myDF.head() +---- + +---- + iata airport city state country lat long +0 00M Thigpen Bay Springs MS USA 31.953765 -89.234505 +1 00R Livingston Municipal Livingston TX USA 30.685861 -95.017928 +2 00V Meadow Lake Colorado Springs CO USA 38.945749 -104.569893 +3 01G Perry-Warsaw Perry NY USA 42.741347 -78.052081 +4 01J Hilliard Airpark Hilliard FL USA 30.688012 -81.905944 +---- + +**What if I wanted to display the first `n` rows of my DataFrame?** + +In this case pandas has a handy built-in `head` function. By default `head` will return the first 5 rows. We can also pass an `n=` argument to the function if we want a different number of rows: + +[source, python] +---- +print(myDF.head(n=2)) +---- + +---- + iata airport city state country lat long +0 00M Thigpen Bay Springs MS USA 31.953765 -89.234505 +1 00R Livingston Municipal Livingston TX USA 30.685861 -95.017928 +---- + + + +== Size of the Data + +If we wanted to extract the number of rows in the dataset we could use the `len()` function. + + +[source, python] +---- +len(myDF) +---- + +Now let's use the `.shape` attribute to see how many rows AND columns we have in our data. + + +[source, python] +---- +# Getting the number of rows and columns +print(myDF.shape) +---- + +This returns a tuple with the first value as the number of rows and the second as the number of columns: + +---- +(3376, 7) +---- + +== Data Extraction +Let's say we wanted to view just the 11th row in the DataFrame. We can use the `.iloc` indexer in pandas. Note that `.iloc` uses zero-based indexing, so the 11th row corresponds to index 10. 
+[source, python] +---- +myDF.iloc[10,] +---- + +---- +iata 04M +airport Calhoun County +city Pittsboro +state MS +country USA +lat 33.930112 +long -89.342852 +Name: 10, dtype: object +---- + +== The Variables in the Data + +Now let's use the `.columns` attribute to see the names of the columns we have in our dataset. + + +[source, python] +---- +myDF.columns +---- + +---- +Index(['iata', 'airport', 'city', 'state', 'country', 'lat', 'long'], dtype='object') +---- + +The variables in this dataset are: + +* iata - abbreviation used to identify the airport +* airport - airport name +* city - The city where the airport is located +* state - The state where the airport is located +* country - The country where the airport is located +* lat - latitude +* long - longitude + + +== Renaming Columns + +**What if I wanted to change the name of one of my columns?** + +[source, python] +---- +myDF = myDF.rename(columns={'long': 'lon'}) +---- + +You could also add the `inplace=True` argument to make the change directly to the DataFrame: + +[source, python] +---- +myDF.rename(columns={'long': 'lon'}, inplace=True) +---- + +Either method would result in the `long` column being renamed to `lon` in this example. + +[source, python] +---- +myDF.rename(columns={'long': 'lon'}, inplace=True) +print(myDF.columns) +---- + +---- +Index(['iata', 'airport', 'city', 'state', 'country', 'lat', 'lon'], dtype='object') +---- + + +== Dataset Summary with describe() + +[source, python] +---- +myDF.describe() +---- + +---- + lat lon +count 3376.000000 3376.000000 +mean 40.036524 -98.621205 +std 8.329559 22.869458 +min 7.367222 -176.646031 +25% 34.688427 -108.761121 +50% 39.434449 -93.599425 +75% 43.372612 -84.137519 +max 71.285448 145.621384 +---- + +The `describe()` function in pandas generates a summary of descriptive statistics for numeric columns in the dataset. 
Based on our output, we can see that the average latitude in our dataset is approximately 40.04, while the average longitude is around -98.62. This suggests that, on average, the airports in this dataset are located in the central United States. + +== Unique values in a column + +Additionally, if we wanted to see which unique countries appear in the airports dataset, we could use the `unique()` function. When performing EDA, it's often useful to understand the quantity and uniqueness of a specific category, making this function particularly helpful. + +[source, python] +---- +unique_countries = myDF['country'].unique() +print(unique_countries) +---- + +---- +['USA' 'Thailand' 'Palau' 'N Mariana Islands' + 'Federated States of Micronesia'] +---- + +Understanding the columns in your dataset is a critical step when conducting initial exploratory data analysis. + +== Value Counts +When working with categorical data, `value_counts()` is also a useful function. The `value_counts()` function returns the number of times each value appears in the column. By default, the output is sorted in descending order of count. + + +[source, python] +---- +airport_counts = myDF['airport'].value_counts() #One categorical variable +print(airport_counts) +---- + +---- +airport +Municipal 5 +Jackson County 5 +Monroe County 5 +Lancaster 4 +Plymouth Municipal 4 + .. +Chehalis-Centralia 1 +Charlotte/Douglas International 1 +Clearwater Air Park 1 +Camarillo 1 +Zanesville Municipal 1 +Name: count, Length: 3245, dtype: int64 +---- + +[source, python] +---- +country_airport_counts = myDF[['country', 'airport']].value_counts() #Two categorical variables +print(country_airport_counts) +---- + +---- +country airport +USA Jackson County 5 + Monroe County 5 + Municipal 5 + Plymouth Municipal 4 + Lancaster 4 + .. 
+ Georgetown-Scott County 1 + Geraldine 1 + Gettysburg & Travel Center 1 + Gettysburg Municipal 1 + Zephyrhills Municipal 1 +Name: count, Length: 3245, dtype: int64 +---- + +== Is In +The `isin()` function checks whether each element of a column is contained in a given list of values, returning a boolean Series. + +[source, python] +---- +myDF['country'].isin(['Thailand']) +---- + +---- +0 False +1 False +2 False +3 False +4 False + ... +3371 False +3372 False +3373 False +3374 False +3375 False +Name: country, Length: 3376, dtype: bool +---- diff --git a/tools-appendix/modules/python/pages/index.adoc b/tools-appendix/modules/python/pages/index.adoc index 3b3034a2b..46d2e7791 100644 --- a/tools-appendix/modules/python/pages/index.adoc +++ b/tools-appendix/modules/python/pages/index.adoc @@ -11,7 +11,7 @@ Python is largely known for its readability and versatility. Its design philosop * xref:introduction-to-jupyter-lab.adoc[Introduction to Jupyter] * xref:basics-programming.adoc[Basics of Programming] * xref:lists-dictionaries-tuples-loops.adoc[Introduction: Lists, Tuples, Dictionaries] -* xref:eda-initial.adoc[Basics of Exploratory Data Analysis (EDA)] +* xref:eda.adoc[Basics of Exploratory Data Analysis (EDA)] * xref:control-flow.adoc[Control Flow] * xref:filtering-and-selecting.adoc[Filtering and Selecting] * xref:matplotlib.adoc[Data Visualization with matplotlib]