diff --git a/4population scatter plot rugalde 211124.png b/4population scatter plot rugalde 211124.png
new file mode 100644
index 000000000..4a0f57312
Binary files /dev/null and b/4population scatter plot rugalde 211124.png differ
diff --git a/Copy_of_projectRealStateCleanUP__solved__notpushed.ipynb b/Copy_of_projectRealStateCleanUP__solved__notpushed.ipynb
new file mode 100644
index 000000000..47a91415d
--- /dev/null
+++ b/Copy_of_projectRealStateCleanUP__solved__notpushed.ipynb
@@ -0,0 +1,1367 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "innocent-university",
+ "metadata": {
+ "id": "innocent-university"
+ },
+ "source": [
+ "# Real Estate Clean up\n",
+ "\n",
+ "This is a real dataset, and it was downloaded using web scraping techniques. The data contains records from **Fotocasa**, which is one of the most popular real estate websites in Spain. Please do not do this (web scraping) unless it is for academic purposes.\n",
+ "\n",
+ "The dataset was downloaded a few years ago by Henry Navarro, and in no case were economic returns obtained from it.\n",
+ "\n",
+ "It contains thousands of records of real houses published on the website www.fotocasa.com. Your goal is to extract as much information as possible with the knowledge you have so far about data science; for example, what is the most expensive house in the entire dataset?\n",
+ "\n",
+ "Let's start with precisely that question... Good luck!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "multiple-glass",
+ "metadata": {
+ "id": "multiple-glass"
+ },
+ "source": [
+ "#### Exercise 00. Read the dataset assets/real_estate.csv and try to visualize the table (★☆☆)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "frank-heath",
+ "metadata": {
+ "id": "frank-heath",
+ "outputId": "0732f823-a4fc-4ced-a719-1459716d3cb4",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 652
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " Unnamed: 0 id_realEstates isNew realEstate_name \\\n",
+ "0 1 153771986 False ferrari 57 inmobiliaria \n",
+ "1 2 153867863 False tecnocasa fuenlabrada ferrocarril \n",
+ "2 3 153430440 False look find boadilla \n",
+ "3 4 152776331 False tecnocasa fuenlabrada ferrocarril \n",
+ "4 5 153180188 False ferrari 57 inmobiliaria \n",
+ "... ... ... ... ... \n",
+ "15330 15331 153901377 False infocasa consulting \n",
+ "15331 15332 150394373 False inmobiliaria pulpon \n",
+ "15332 15333 153901397 False tecnocasa torrelodones \n",
+ "15333 15334 152607440 False inmobiliaria pulpon \n",
+ "15334 15335 153901356 False infocasa consulting \n",
+ "\n",
+ " phone_realEstate url_inmueble \\\n",
+ "0 912177526.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "1 916358736.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "2 916350408.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "3 916358736.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "4 912177526.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "... ... ... \n",
+ "15330 911360461.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "15331 912788039.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "15332 912780348.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "15333 912788039.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "15334 911360461.0 https://www.fotocasa.es/es/comprar/vivienda/ma... \n",
+ "\n",
+ " rooms bathrooms surface price ... level4Id level5Id level6Id \\\n",
+ "0 3.0 2.0 103.0 195000 ... 0 0 0 \n",
+ "1 3.0 1.0 NaN 89000 ... 0 0 0 \n",
+ "2 2.0 2.0 99.0 390000 ... 0 0 0 \n",
+ "3 3.0 1.0 86.0 89000 ... 0 0 0 \n",
+ "4 2.0 2.0 106.0 172000 ... 0 0 0 \n",
+ "... ... ... ... ... ... ... ... ... \n",
+ "15330 2.0 1.0 96.0 259470 ... 0 0 0 \n",
+ "15331 3.0 1.0 150.0 165000 ... 0 0 0 \n",
+ "15332 4.0 2.0 175.0 495000 ... 0 0 0 \n",
+ "15333 3.0 2.0 101.0 195000 ... 0 0 0 \n",
+ "15334 3.0 2.0 152.0 765000 ... 0 0 0 \n",
+ "\n",
+ " level7Id level8Id accuracy latitude longitude zipCode \\\n",
+ "0 0 0 0 40,2948276786438 -3,44402412135624 NaN \n",
+ "1 0 0 1 40,28674 -3,79351 NaN \n",
+ "2 0 0 0 40,4115646786438 -3,90662252135624 NaN \n",
+ "3 0 0 0 40,2853785786438 -3,79508142135624 NaN \n",
+ "4 0 0 0 40,2998774864376 -3,45226301356237 NaN \n",
+ "... ... ... ... ... ... ... \n",
+ "15330 0 0 0 40,45416 -3,70286 NaN \n",
+ "15331 0 0 0 40,36652 -3,48951 NaN \n",
+ "15332 0 0 0 40,57444 -3,92124 NaN \n",
+ "15333 0 0 0 40,36967 -3,48105 NaN \n",
+ "15334 0 0 0 40,45773 -3,69068 NaN \n",
+ "\n",
+ " customZone \n",
+ "0 NaN \n",
+ "1 NaN \n",
+ "2 NaN \n",
+ "3 NaN \n",
+ "4 NaN \n",
+ "... ... \n",
+ "15330 NaN \n",
+ "15331 NaN \n",
+ "15332 NaN \n",
+ "15333 NaN \n",
+ "15334 NaN \n",
+ "\n",
+ "[15335 rows x 37 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " id_realEstates | \n",
+ " isNew | \n",
+ " realEstate_name | \n",
+ " phone_realEstate | \n",
+ " url_inmueble | \n",
+ " rooms | \n",
+ " bathrooms | \n",
+ " surface | \n",
+ " price | \n",
+ " ... | \n",
+ " level4Id | \n",
+ " level5Id | \n",
+ " level6Id | \n",
+ " level7Id | \n",
+ " level8Id | \n",
+ " accuracy | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " zipCode | \n",
+ " customZone | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 153771986 | \n",
+ " False | \n",
+ " ferrari 57 inmobiliaria | \n",
+ " 912177526.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 103.0 | \n",
+ " 195000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,2948276786438 | \n",
+ " -3,44402412135624 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 153867863 | \n",
+ " False | \n",
+ " tecnocasa fuenlabrada ferrocarril | \n",
+ " 916358736.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 89000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 40,28674 | \n",
+ " -3,79351 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 153430440 | \n",
+ " False | \n",
+ " look find boadilla | \n",
+ " 916350408.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 99.0 | \n",
+ " 390000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,4115646786438 | \n",
+ " -3,90662252135624 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 152776331 | \n",
+ " False | \n",
+ " tecnocasa fuenlabrada ferrocarril | \n",
+ " 916358736.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 86.0 | \n",
+ " 89000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,2853785786438 | \n",
+ " -3,79508142135624 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 153180188 | \n",
+ " False | \n",
+ " ferrari 57 inmobiliaria | \n",
+ " 912177526.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 106.0 | \n",
+ " 172000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,2998774864376 | \n",
+ " -3,45226301356237 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 15330 | \n",
+ " 15331 | \n",
+ " 153901377 | \n",
+ " False | \n",
+ " infocasa consulting | \n",
+ " 911360461.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 96.0 | \n",
+ " 259470 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,45416 | \n",
+ " -3,70286 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 15331 | \n",
+ " 15332 | \n",
+ " 150394373 | \n",
+ " False | \n",
+ " inmobiliaria pulpon | \n",
+ " 912788039.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 150.0 | \n",
+ " 165000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,36652 | \n",
+ " -3,48951 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 15332 | \n",
+ " 15333 | \n",
+ " 153901397 | \n",
+ " False | \n",
+ " tecnocasa torrelodones | \n",
+ " 912780348.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 175.0 | \n",
+ " 495000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,57444 | \n",
+ " -3,92124 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 15333 | \n",
+ " 15334 | \n",
+ " 152607440 | \n",
+ " False | \n",
+ " inmobiliaria pulpon | \n",
+ " 912788039.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 101.0 | \n",
+ " 195000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,36967 | \n",
+ " -3,48105 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 15334 | \n",
+ " 15335 | \n",
+ " 153901356 | \n",
+ " False | \n",
+ " infocasa consulting | \n",
+ " 911360461.0 | \n",
+ " https://www.fotocasa.es/es/comprar/vivienda/ma... | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 152.0 | \n",
+ " 765000 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 40,45773 | \n",
+ " -3,69068 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
15335 rows × 37 columns
\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "ds"
+ }
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Fields in this CSV are delimited by semicolons rather than commas.\n",
+ "ds = pd.read_csv(\n",
+ "    filepath_or_buffer='/content/sample_data/real_estate.csv',\n",
+ "    sep=';',\n",
+ ")\n",
+ "ds\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "latin-guest",
+ "metadata": {
+ "id": "latin-guest"
+ },
+ "source": [
+ "#### Exercise 01. Which is the most expensive house in the dataset? (★☆☆)\n",
+ "\n",
+ "Print the address and the price of the selected house. For example:\n",
+ "\n",
+ "`The house with address General Street Nº5 is the most expensive and its price is 5000000 USD`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "developing-optimum",
+ "metadata": {
+ "id": "developing-optimum",
+ "outputId": "784b166a-dae9-44c3-9467-b5d39b69a231",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "The house with address El Escorial is the most expensive and its price is 8500000 USD.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Look up the row whose 'price' value is the column maximum.\n",
+ "most_expensive = ds.loc[ds['price'].idxmax(), :]\n",
+ "\n",
+ "# Pull out the two fields that the summary sentence reports.\n",
+ "address, price = most_expensive['address'], most_expensive['price']\n",
+ "\n",
+ "print(f\"The house with address {address} is the most expensive and its price is {price} USD.\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "lesser-cosmetic",
+ "metadata": {
+ "id": "lesser-cosmetic"
+ },
+ "source": [
+ "#### Exercise 02. Which is the cheapest house in the dataset? (★☆☆)\n",
+ "\n",
+ "Print the address and the price of the selected house. For example:\n",
+ "\n",
+ "`The house with address Concrete Street Nº1 is the cheapest and its price is 12000 USD`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "lovely-oasis",
+ "metadata": {
+ "id": "lovely-oasis",
+ "outputId": "d7ff7a44-e558-4b03-ca8c-869d550b9565",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ }
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "The house with address Parla is the cheapest and its price is 0 USD.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Find the cheapest house.\n",
+ "# `ds` is the DataFrame loaded by the Exercise 00 cell; it is not re-read here.\n",
+ "# The unfiltered minimum of 'price' is 0, which is presumably a placeholder for an\n",
+ "# unpublished price rather than a real offer (TODO confirm against the data source),\n",
+ "# so only strictly positive prices compete for the minimum.\n",
+ "cheapest_house = ds.loc[ds.loc[ds['price'] > 0, 'price'].idxmin()]\n",
+ "\n",
+ "# Extract the fields reported in the summary sentence.\n",
+ "address = cheapest_house['address']\n",
+ "price = cheapest_house['price']\n",
+ "\n",
+ "# Print the result\n",
+ "print(f\"The house with address {address} is the cheapest and its price is {price} USD.\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "compliant-fellowship",
+ "metadata": {
+ "id": "compliant-fellowship"
+ },
+ "source": [
+ "#### Exercise 03. Which is the biggest and the smallest house in the dataset? (★☆☆)\n",
+ "\n",
+ "Print both the address and the surface of the selected houses. For example:\n",
+ "\n",
+ "`The biggest house is located on Yukka Street Nº10 and its surface is 5000 meters`\n",
+ "\n",
+ "`The smallest house is located on County Road 1 N and its surface is 200 meters`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "every-tiffany",
+ "metadata": {
+ "id": "every-tiffany"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "danish-spirit",
+ "metadata": {
+ "id": "danish-spirit"
+ },
+ "source": [
+ "#### Exercise 04. How many populations (level5 column) does the dataset contain? (★☆☆)\n",
+ "\n",
+ "Print the names of the populations with a comma as a separator. For example:\n",
+ "\n",
+ "`> print(populations)`\n",
+ "\n",
+ "`population1, population2, population3, ...`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "exciting-accreditation",
+ "metadata": {
+ "id": "exciting-accreditation"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "crazy-blame",
+ "metadata": {
+ "id": "crazy-blame"
+ },
+ "source": [
+ "#### Exercise 05. Does the dataset contain NAs? (★☆☆)\n",
+ "\n",
+ "Print a boolean value (`True` or `False`) followed by the rows/cols that contain NAs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "transparent-poetry",
+ "metadata": {
+ "id": "transparent-poetry"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "italic-hydrogen",
+ "metadata": {
+ "id": "italic-hydrogen"
+ },
+ "source": [
+ "#### Exercise 06. Delete the NAs of the dataset, if applicable (★★☆)\n",
+ "\n",
+ "Print a comparison between the dimensions of the original DataFrame versus the DataFrame after the deletions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "administrative-roads",
+ "metadata": {
+ "id": "administrative-roads"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "middle-china",
+ "metadata": {
+ "id": "middle-china"
+ },
+ "source": [
+ "#### Exercise 07. What is the mean of the prices in the population (level5 column) of \"Arroyomolinos (Madrid)\"? (★★☆)\n",
+ "\n",
+ "Print the obtained value."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "nuclear-belief",
+ "metadata": {
+ "id": "nuclear-belief"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "concerned-radical",
+ "metadata": {
+ "id": "concerned-radical"
+ },
+ "source": [
+ "#### Exercise 08. Plot the histogram of prices for the population (level5 column) of \"Arroyomolinos (Madrid)\" and explain what you observe (★★☆)\n",
+ "\n",
+ "Print the histogram of the prices and write in the Markdown cell a brief analysis about the plot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "sudden-message",
+ "metadata": {
+ "id": "sudden-message"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO: Code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "impressed-combination",
+ "metadata": {
+ "id": "impressed-combination"
+ },
+ "source": [
+ "**TODO: Markdown**. To write here, double-click on this cell, remove this content and place the text you want to write. Then, execute the cell."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "actual-edinburgh",
+ "metadata": {
+ "id": "actual-edinburgh"
+ },
+ "source": [
+ "#### Exercise 09. Are the average prices of \"Valdemorillo\" and \"Galapagar\" the same? (★★☆)\n",
+ "\n",
+ "Print both average prices and then write a conclusion about them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "numeric-commerce",
+ "metadata": {
+ "id": "numeric-commerce"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "lonely-article",
+ "metadata": {
+ "id": "lonely-article"
+ },
+ "source": [
+ "#### Exercise 10. Are the average prices per square meter (price/m2) of \"Valdemorillo\" and \"Galapagar\" the same? (★★☆)\n",
+ "\n",
+ "Print both average prices and then write a conclusion about it.\n",
+ "\n",
+ "Hint: Create a new column called `pps` (price per square meter) and then analyze the values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "hourly-globe",
+ "metadata": {
+ "id": "hourly-globe"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "pleasant-invite",
+ "metadata": {
+ "id": "pleasant-invite"
+ },
+ "source": [
+ "#### Exercise 11. Analyze the relation between the surface and the price of the houses (★★☆)\n",
+ "\n",
+ "Hint: You can make a `scatter plot`, then write a conclusion about it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "common-drilling",
+ "metadata": {
+ "id": "common-drilling"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO: Code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ahead-liquid",
+ "metadata": {
+ "id": "ahead-liquid"
+ },
+ "source": [
+ "**TODO: Markdown**. To write here, double-click on this cell, remove this content and place the text you want to write. Then, execute the cell."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "coordinate-sunrise",
+ "metadata": {
+ "id": "coordinate-sunrise"
+ },
+ "source": [
+ "#### Exercise 12. How many real estate agencies does the dataset contain? (★★☆)\n",
+ "\n",
+ "Print the obtained value."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "valid-honolulu",
+ "metadata": {
+ "id": "valid-honolulu"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "binding-ebony",
+ "metadata": {
+ "id": "binding-ebony"
+ },
+ "source": [
+ "#### Exercise 13. Which is the population (level5 column) that contains the most houses? (★★☆)\n",
+ "\n",
+ "Print both the population and the number of houses."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "static-perry",
+ "metadata": {
+ "id": "static-perry"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "entire-classification",
+ "metadata": {
+ "id": "entire-classification"
+ },
+ "source": [
+ "#### Exercise 14. Now let's work with the \"south belt\" of Madrid. Make a subset of the original DataFrame that contains the following populations (level5 column): \"Fuenlabrada\", \"Leganés\", \"Getafe\", \"Alcorcón\" (★★☆)\n",
+ "\n",
+ "Hint: Filter the original DataFrame using the column `level5` and the function `isin`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "binary-input",
+ "metadata": {
+ "id": "binary-input"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "severe-fisher",
+ "metadata": {
+ "id": "severe-fisher"
+ },
+ "source": [
+ "#### Exercise 15. Make a bar plot of the median of the prices and explain what you observe (you must use the subset obtained in Exercise 14) (★★★)\n",
+ "\n",
+ "Print the bar plot of the median prices and write a brief analysis of the plot in the Markdown cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "lyric-bunch",
+ "metadata": {
+ "id": "lyric-bunch"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO: Code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "sublime-newspaper",
+ "metadata": {
+ "id": "sublime-newspaper"
+ },
+ "source": [
+ "**TODO: Markdown**. To write here, double-click on this cell, remove this content and place the text you want to write. Then, execute the cell."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "speaking-diamond",
+ "metadata": {
+ "id": "speaking-diamond"
+ },
+ "source": [
+ "#### Exercise 16. Calculate the sample mean and variance of the variables: price, rooms, surface area and bathrooms (you must use the subset obtained in Exercise 14) (★★★)\n",
+ "\n",
+ "Print both values for each variable."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "random-feeling",
+ "metadata": {
+ "id": "random-feeling"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "revolutionary-matrix",
+ "metadata": {
+ "id": "revolutionary-matrix"
+ },
+ "source": [
+ "#### Exercise 17. What is the most expensive house in each population? You must use the subset obtained in Exercise 14 (★★☆)\n",
+ "\n",
+ "Print both the address and the price of the selected house of each population. You can print a DataFrame or a single line for each population."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fifteen-browse",
+ "metadata": {
+ "id": "fifteen-browse"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "activated-knight",
+ "metadata": {
+ "id": "activated-knight"
+ },
+ "source": [
+ "#### Exercise 18. Normalize the variable of prices for each population and plot the 4 histograms in the same plot (you must use the subset obtained in Exercise 14) (★★★)\n",
+ "\n",
+ "For the normalization method, you can use the one you consider; there is not a single correct answer to this question. Print the plot and write in the Markdown cell a brief analysis about the plot.\n",
+ "\n",
+ "Hint: You can help yourself by reviewing the *multihist* demo of Matplotlib."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "civic-meditation",
+ "metadata": {
+ "id": "civic-meditation"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "precise-heavy",
+ "metadata": {
+ "id": "precise-heavy"
+ },
+ "source": [
+ "**TODO: Markdown**. To write here, double-click on this cell, remove this content and place the text you want to write. Then, execute the cell."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "patent-jonathan",
+ "metadata": {
+ "id": "patent-jonathan"
+ },
+ "source": [
+ "#### Exercise 19. What can you say about the price per square meter (price/m2) between the towns of \"Getafe\" and \"Alcorcón\"? You must use the subset obtained in Exercise 14 (★★☆)\n",
+ "\n",
+ "Hint: Create a new column called `pps` (price per square meter) and then analyze the values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "initial-liverpool",
+ "metadata": {
+ "id": "initial-liverpool"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "enhanced-moscow",
+ "metadata": {
+ "id": "enhanced-moscow"
+ },
+ "source": [
+ "#### Exercise 20. Make the same plot for 4 different populations (level5 column) and rearrange them on the same graph. You must use the subset obtained in Exercise 14 (★★☆)\n",
+ "\n",
+ "Hint: Make a scatter plot of each population using subplots."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "accepting-airfare",
+ "metadata": {
+ "id": "accepting-airfare"
+ },
+ "outputs": [],
+ "source": [
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "blocked-effects",
+ "metadata": {
+ "id": "blocked-effects"
+ },
+ "source": [
+ "#### Exercise 21. Make a plot of the coordinates (latitude and longitude columns) of the south belt of Madrid by color of each population (you must use the subset obtained in Exercise 14) (★★★★)\n",
+ "\n",
+ "Execute the following cell, and then start coding in the next one. You must implement simple code that transforms the coordinate columns into a Python dictionary (add more information if needed) and then adds it to the map."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "headed-privacy",
+ "metadata": {
+ "id": "headed-privacy"
+ },
+ "outputs": [],
+ "source": [
+ "from ipyleaflet import Map, basemaps\n",
+ "\n",
+ "# Build an ipyleaflet Map object and show it as the cell output.\n",
+ "# Map centered on (60 degrees latitude and -2.2 degrees longitude)\n",
+ "# Latitude, longitude\n",
+ "# NOTE(review): the name `map` shadows Python's built-in `map` from this cell onwards.\n",
+ "# NOTE(review): the listings shown in the Exercise 00 output sit around latitude 40,\n",
+ "# longitude -3, so this starting view is not centred on them -- TODO confirm intended view.\n",
+ "map = Map(center = (60, -2.2), zoom = 2, min_zoom = 1, max_zoom = 20,\n",
+ " basemap=basemaps.Stamen.Terrain)\n",
+ "map"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "present-mistress",
+ "metadata": {
+ "id": "present-mistress"
+ },
+ "outputs": [],
+ "source": [
+ "## HERE: plot the coordinates of the estates\n",
+ "\n",
+ "## PUT HERE YOUR CODE:\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.4"
+ },
+ "colab": {
+ "provenance": []
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/normalized price histogram south belt madrid r ugalde.png b/normalized price histogram south belt madrid r ugalde.png
new file mode 100644
index 000000000..f4e8f1098
Binary files /dev/null and b/normalized price histogram south belt madrid r ugalde.png differ
diff --git a/price per sqr meter Getafe Alcorcon R. Ugalde 211124.png b/price per sqr meter Getafe Alcorcon R. Ugalde 211124.png
new file mode 100644
index 000000000..3cd19e12f
Binary files /dev/null and b/price per sqr meter Getafe Alcorcon R. Ugalde 211124.png differ
diff --git a/real estate clean up Rober Ugalde.py b/real estate clean up Rober Ugalde.py
new file mode 100644
index 000000000..f21771008
--- /dev/null
+++ b/real estate clean up Rober Ugalde.py
@@ -0,0 +1,711 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Nov 21 06:24:10 2024
+
+@author: rober ugalde
+"""
+
+import pandas as pd
+
+# This CSV file contains semicolons instead of comas as separator
+ds = pd.read_csv('C:/Users/rober ugalde/Documents/real_estate.csv', sep=';')
+ds
+
+
+#Exercise 01. Which is the most expensive house in the dataset? (★☆☆)
+#Print the address and the price of the selected house. For example:
+#The house with address General Street Nº5 is the most
+#expensive and its price is 5000000 USD
+
+
+
+# Find the most expensive house
+most_expensive = ds.loc[ds['price'].idxmax()] # Assuming 'price' is the column with house prices
+
+# Display the details of the most expensive house
+address = most_expensive['address'] # Assuming 'address' is the column for house addresses
+price = most_expensive['price']
+
+print(f"The house with address {address} is the most expensive and its price is {price} USD.")
+
+
+
+#Exercise 02. Which is the cheapest house in the dataset? (★☆☆)
+#Print the address and the price of the selected house. For example:
+
+#The house with address Concrete Street Nº1 is the cheapest and its price is 12000 USD
+
+
+# Find the cheapest house
+cheapest_house = ds.loc[ds['price'].idxmin()] # Assuming 'price' is the column with house prices
+
+# Extract details of the cheapest house
+address = cheapest_house['address'] # Assuming 'address' is the column for house addresses
+price = cheapest_house['price']
+
+# Print the result
+print(f"The house with address {address} is the cheapest and its price is {price} USD.")
+
+
+
+
+
+#Exercise 03. Which is the biggest and the smallest house in the dataset? (★☆☆)
+#Print both the address and the surface of the selected houses. For example:
+
+#The biggest house is located on Yukka Street Nº10 and its surface is 5000 meters
+
+#The smallest house is located on County Road 1 N and its surface is 200 meters
+
+
+
+
+
+
+
+
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Find the biggest house
+biggest_house = ds.loc[ds['surface'].idxmax()] # Assuming 'surface' is the column for house surface areas
+biggest_address = biggest_house['address'] # Assuming 'address' is the column for house addresses
+biggest_surface = biggest_house['surface']
+
+# Find the smallest house
+smallest_house = ds.loc[ds['surface'].idxmin()]
+smallest_address = smallest_house['address']
+smallest_surface = smallest_house['surface']
+
+# Print the results
+print(f"The biggest house is located on {biggest_address} and its surface is {biggest_surface} meters.")
+print(f"The smallest house is located on {smallest_address} and its surface is {smallest_surface} meters.")
+
+
+
+#Exercise 04. How many populations (level5 column) the dataset contains? (★☆☆)
+#Print the names of the populations with a comma as a separator. For example:
+
+#> print(populations)
+
+#population1, population2, population3, ...
+
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Get unique populations from the 'level5' column
+populations = ds['level5'].unique() # Assuming 'level5' is the column for populations
+
+# Convert the array to a comma-separated string
+populations_list = ', '.join(populations)
+
+# Print the names of the populations
+print(f"Populations: {populations_list}")
+
+# Print the total number of unique populations
+print(f"Number of populations: {len(populations)}")
+
+
+#Exercise 05. Does the dataset contain NAs? (★☆☆)
+#Print a boolean value (True or False) followed by the rows/cols that contains NAs.
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Check if the dataset contains any NAs
+contains_nas = ds.isnull().values.any()
+
+# Print whether the dataset contains NAs
+print(f"Contains NAs: {contains_nas}")
+
+if contains_nas:
+ # Print rows with NAs
+ rows_with_nas = ds[ds.isnull().any(axis=1)]
+ print("\nRows with NAs:")
+ print(rows_with_nas)
+
+ # Print columns with NAs
+ cols_with_nas = ds.columns[ds.isnull().any()].tolist()
+ print("\nColumns with NAs:")
+ print(cols_with_nas)
+else:
+ print("The dataset does not contain any NAs.")
+
+
+
+
+#Exercise 06. Delete the NAs of the dataset, if applicable (★★☆)
+#Print a comparison between the dimensions of the original DataFrame versus the DataFrame after the deletions.
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Store original dimensions
+original_shape = ds.shape
+
+# Remove rows with NAs
+ds_cleaned = ds.dropna()
+
+# Store cleaned dimensions
+cleaned_shape = ds_cleaned.shape
+
+# Print comparison of dimensions
+print(f"Original DataFrame dimensions: {original_shape}")
+print(f"DataFrame dimensions after removing NAs: {cleaned_shape}")
+
+# Check how many rows were removed
+rows_removed = original_shape[0] - cleaned_shape[0]
+print(f"Number of rows removed: {rows_removed}")
+
+
+
+#Exercise 07. Which is the mean of prices in the population (level5 column) of "Arroyomolinos (Madrid)"? (★★☆)
+#Print the obtained value.
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Filter the dataset for the population "Arroyomolinos (Madrid)"
+arroyomolinos_data = ds[ds['level5'] == 'Arroyomolinos (Madrid)'] # Assuming 'level5' is the population column
+
+# Calculate the mean price
+mean_price = arroyomolinos_data['price'].mean() # Assuming 'price' is the price column
+
+# Print the mean price
+print(f"The mean price of houses in 'Arroyomolinos (Madrid)' is {mean_price:.2f} USD.")
+
+
+
+
+#Exercise 08. Plot the histogram of prices for the population (level5 column) of "Arroyomolinos (Madrid)" and explain what you observe (★★☆)
+#Print the histogram of the prices and write in the Markdown cell a brief analysis about the plot.
+
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Filter the dataset for the population "Arroyomolinos (Madrid)"
+arroyomolinos_data = ds[ds['level5'] == 'Arroyomolinos (Madrid)'] # Assuming 'level5' is the population column
+
+# Extract prices for the filtered population
+prices = arroyomolinos_data['price'] # Assuming 'price' is the price column
+
+# Plot the histogram
+plt.figure(figsize=(10, 6))
+plt.hist(prices, bins=10, color='skyblue', alpha=0.8, edgecolor='black')
+plt.title("Histogram of Prices in 'Arroyomolinos (Madrid)'", fontsize=16)
+plt.xlabel("Price (USD)", fontsize=14)
+plt.ylabel("Frequency", fontsize=14)
+plt.grid(axis='y', linestyle='--', alpha=0.7)
+plt.show()
+# Observations and analysis:
+# Price distribution:
+#
+# The histogram shows the spread of property prices
+# in "Arroyomolinos (Madrid)". For example:
+# if the bars are concentrated around a specific range,
+# it suggests most properties are priced similarly;
+# a wider spread indicates diverse pricing.
+# Peaks:
+#
+# Peaks (tall bars) in the histogram represent price ranges
+# with a high number of properties.
+# A peak at a lower price range might suggest affordability in that area.
+#
+# Outliers:
+#
+# If there are bars far away from the rest of the distribution,
+# they may indicate outliers, such as luxury homes.
+
+
+
+ #Exercise 09. Are the average prices of "Valdemorillo" and "Galapagar" the same? (★★☆)
+#Print both average prices and then write a conclusion about them.
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Filter data for "Valdemorillo" and "Galapagar"
+valdemorillo_data = ds[ds['level5'] == 'Valdemorillo'] # Assuming 'level5' is the population column
+galapagar_data = ds[ds['level5'] == 'Galapagar']
+
+# Calculate the average prices
+valdemorillo_avg_price = valdemorillo_data['price'].mean() # Assuming 'price' is the price column
+galapagar_avg_price = galapagar_data['price'].mean()
+
+# Print the results
+print(f"Average price in Valdemorillo: {valdemorillo_avg_price:.2f} USD")
+print(f"Average price in Galapagar: {galapagar_avg_price:.2f} USD")
+
+# Compare and conclude
+if valdemorillo_avg_price == galapagar_avg_price:
+ print("The average prices in Valdemorillo and Galapagar are the same.")
+else:
+ print("The average prices in Valdemorillo and Galapagar are different.")
+
+
+#Exercise 10. Are the average prices per square meter (price/m2) of "Valdemorillo" and "Galapagar" the same? (★★☆)
+#Print both average prices and then write a conclusion about it.
+
+#Hint: Create a new column called pps (price per square meter) and
+#then analyze the values.
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Create a new column for price per square meter (pps)
+ds['pps'] = ds['price'] / ds['surface'] # Assuming 'price' and 'surface' columns exist
+
+# Filter data for "Valdemorillo" and "Galapagar"
+valdemorillo_data = ds[ds['level5'] == 'Valdemorillo'] # Assuming 'level5' is the population column
+galapagar_data = ds[ds['level5'] == 'Galapagar']
+
+# Calculate the average price per square meter for both populations
+valdemorillo_avg_pps = valdemorillo_data['pps'].mean()
+galapagar_avg_pps = galapagar_data['pps'].mean()
+
+# Print the results
+print(f"Average price per square meter in Valdemorillo: {valdemorillo_avg_pps:.2f} USD/m²")
+print(f"Average price per square meter in Galapagar: {galapagar_avg_pps:.2f} USD/m²")
+
+# Compare and conclude
+if valdemorillo_avg_pps == galapagar_avg_pps:
+ print("The average price per square meter in Valdemorillo and Galapagar is the same.")
+else:
+ print("The average price per square meter in Valdemorillo and Galapagar is different.")
+
+
+
+#Exercise 11. Analyze the relation between the surface and the price of the houses (★★☆)
+#Hint: You can make a scatter plot, then write a conclusion about it.
+
+import matplotlib.pyplot as plt
+
+# Scatter plot for surface vs. price
+plt.figure(figsize=(10, 6))
+plt.scatter(ds['surface'], ds['price'], alpha=0.7, c='blue', edgecolor='black')
+plt.title("Scatter Plot: Surface vs. Price of Houses", fontsize=16)
+plt.xlabel("Surface (m²)", fontsize=14)
+plt.ylabel("Price (USD)", fontsize=14)
+plt.grid(True, linestyle='--', alpha=0.7)
+plt.show()
+
+#then write a conclusion about it.
+# Conclusion: there is one property measuring 25e4 m2 at a very
+# affordable price of 3.2e5 USD.
+# There are two others in this condition, but they measure 6.67e4 m2 at 4.4e5 USD,
+# which makes the first case even more atypical.
+# The last outlier is at 1.68e4 m2 and 1.3e5 USD.
+
+
+
+#Exercise 12. How many real estate agencies does the dataset contain? (★★☆)
+#Print the obtained value.
+
+
+import pandas as pd
+
+# Count the number of unique real estate agencies
+unique_agencies = ds['realEstate_name'].nunique() # 'realEstate_name' is the correct column for agency names
+
+# Print the result
+print(f"The dataset contains {unique_agencies} real estate agencies.")
+
+
+
+
+#Exercise 13. Which is the population (level5 column) that contains
+# the most houses? (★★☆)
+#Print both the population and the number of houses.
+
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Find the population with the most houses
+population_with_most_houses = ds['level5'].value_counts().idxmax() # Population with the most houses
+number_of_houses = ds['level5'].value_counts().max() # Number of houses in that population
+
+# Print the result
+print(f"The population with the most houses is '{population_with_most_houses}' with {number_of_houses} houses.")
+
+
+
+
+#Exercise 14. Now let's work with the "south belt" of Madrid. Make a subset of the original DataFrame that contains the following populations (level5 column): "Fuenlabrada", "Leganés", "Getafe", "Alcorcón" (★★☆)
+#Hint: Filter the original DataFrame using the column level5 and the function isin.
+
+
+
+import pandas as pd
+
+# Load the dataset
+file_path = r'C:\Users\rober ugalde\Documents\real_estate.csv'
+ds = pd.read_csv(file_path, sep=';')
+
+# Define the "south belt" populations
+south_belt_populations = ["Fuenlabrada", "Leganés", "Getafe", "Alcorcón"]
+
+# Filter the DataFrame using the level5 column and isin
+south_belt_df = ds[ds['level5'].isin(south_belt_populations)]
+
+# Print the first few rows of the subset to verify
+print(south_belt_df.head())
+
+# Print the dimensions of the subset
+print(f"The south belt DataFrame contains {south_belt_df.shape[0]} rows and {south_belt_df.shape[1]} columns.")
+
+
+
+#Exercise 15. Make a bar plot of the median of the prices and explain what you observe (you must use the subset obtained in Exercise 14) (★★★)
+#Print the bar of the median of the prices
+#and write in the Markdown cell a brief analysis about the plot.
+
+
+# Define the "south belt" populations
+south_belt_populations = ["Fuenlabrada", "Leganés", "Getafe", "Alcorcón"]
+
+# Filter the DataFrame using the level5 column and isin
+south_belt_df = ds[ds['level5'].isin(south_belt_populations)]
+
+
+# Calculate the median prices for each population in the south belt
+median_prices = south_belt_df.groupby('level5')['price'].median()
+
+# Plot the bar chart
+plt.figure(figsize=(10, 6))
+median_prices.plot(kind='bar', color='skyblue', edgecolor='black')
+plt.title("Median Prices in the South Belt of Madrid", fontsize=16)
+plt.xlabel("Population", fontsize=14)
+plt.ylabel("Median Price (USD)", fontsize=14)
+plt.xticks(rotation=45)
+plt.grid(axis='y', linestyle='--', alpha=0.7)
+plt.tight_layout()
+
+# Show the plot
+plt.show()
+
+
+#and write in the Markdown cell a brief analysis about the plot.
+# write about the plot:
+
+
+# We can see that Getafe has the highest median price,
+# up to 2.859e5 USD, which is much higher than the next closest
+# median price, Alcorcón, way lower at 1.789e5 USD, or even Leganés,
+# at 1.700e5 USD.
+
+
+#Exercise 16. Calculate the sample mean and variance of the variables: price, rooms, surface area and bathrooms (you must use the subset obtained in Exercise 14) (★★★)
+#Print both values for each variable
+
+
+# Define the "south belt" populations
+south_belt_populations = ["Fuenlabrada", "Leganés", "Getafe", "Alcorcón"]
+
+# Filter the DataFrame using the level5 column and isin
+south_belt_df = ds[ds['level5'].isin(south_belt_populations)]
+
+# Calculate the sample mean and variance for the specified variables
+variables = ['price', 'rooms', 'surface', 'bathrooms']
+
+# Compute the sample mean and variance
+results = {}
+for var in variables:
+ mean = south_belt_df[var].mean()
+ variance = south_belt_df[var].var()
+ results[var] = {'mean': mean, 'variance': variance}
+
+# Print the results
+for var, stats in results.items():
+ print(f"Variable: {var}")
+ print(f" Mean: {stats['mean']:.2f}")
+ print(f" Variance: {stats['variance']:.2f}\n")
+
+
+
+
+#Exercise 17. What is the most expensive house in each population? You must use the subset obtained in Exercise 14 (★★☆)
+#Print both the address and the price of the selected house of each population. You can print a DataFrame or a single line for each population.
+
+
+# Find the most expensive house in each population
+most_expensive_houses = south_belt_df.loc[south_belt_df.groupby('level5')['price'].idxmax()]
+
+# Select relevant columns to display
+result = most_expensive_houses[['level5', 'address', 'price']]
+
+# Print the results as a DataFrame
+print(result)
+
+# Print as individual lines
+for _, row in result.iterrows():
+ print(f"In {row['level5']}, the most expensive house is located at {row['address']} with a price of {row['price']} USD.")
+
+
+
+
+#Exercise 18. Normalize the variable of prices for each population and
+#plot the 4 histograms in the same plot (you must use the subset obtained in Exercise 14) (★★★)
+#For the normalization method, you can use the one you consider;
+#there is not a single correct answer to this question.
+#Print the plot and write in the Markdown cell a
+#brief analysis about the plot.
+
+
+
+
+
+import matplotlib.pyplot as plt
+
+# Normalize the prices for each population
+south_belt_df['normalized_price'] = south_belt_df.groupby('level5')['price'].transform(
+ lambda x: (x - x.mean()) / x.std()
+)
+
+# Prepare the data for plotting
+populations = ["Fuenlabrada", "Leganés", "Getafe", "Alcorcón"]
+colors = ['blue', 'green', 'orange', 'red']
+plt.figure(figsize=(10, 6))
+
+# Plot histograms for each population
+for population, color in zip(populations, colors):
+ subset = south_belt_df[south_belt_df['level5'] == population]
+ plt.hist(
+ subset['normalized_price'],
+ bins=10,
+ alpha=0.5,
+ label=population,
+ color=color,
+ edgecolor='black'
+ )
+
+# Customize the plot
+plt.title("Normalized Price Histograms for the South Belt of Madrid", fontsize=16)
+plt.xlabel("Normalized Price", fontsize=14)
+plt.ylabel("Frequency", fontsize=14)
+plt.legend(title="Population")
+plt.grid(axis='y', linestyle='--', alpha=0.7)
+plt.tight_layout()
+
+# Show the plot
+plt.show()
+
+
+#Print the plot and write in the Markdown cell a
+#brief analysis about the plot.
+
+# Leganés and Alcorcón exhibit more consistent pricing patterns.
+# Fuenlabrada and Getafe show greater variability, likely reflecting a mix of affordable and higher-priced houses.
+# Spread:
+#
+# Some populations (e.g., Getafe) have a wider spread, indicating more variability in housing prices.
+# Other populations (e.g., Alcorcón) appear more compact, suggesting a more uniform pricing structure.
+#
+# Outliers:
+#
+# The presence of bars extending far to the right (e.g., near 4 or 5) may indicate extremely expensive houses relative to the population average.
+
+
+
+
+# Exercise 19. What can you say about the price per square meter (price/m2) between the towns of "Getafe" and "Alcorcón"? You must use the subset obtained in Exercise 14 (★★☆)
+#Hint: Create a new column called pps (price per square meter) and then analyze the values.
+
+
+# Create a new column for price per square meter (pps)
+south_belt_df['pps'] = south_belt_df['price'] / south_belt_df['surface']
+
+# Filter data for "Getafe" and "Alcorcón"
+getafe_data = south_belt_df[south_belt_df['level5'] == 'Getafe']
+alcorcon_data = south_belt_df[south_belt_df['level5'] == 'Alcorcón']
+
+# Calculate summary statistics for price per square meter (pps)
+getafe_pps_mean = getafe_data['pps'].mean()
+getafe_pps_median = getafe_data['pps'].median()
+alcorcon_pps_mean = alcorcon_data['pps'].mean()
+alcorcon_pps_median = alcorcon_data['pps'].median()
+
+# Print the results
+print(f"Getafe: Mean PPS = {getafe_pps_mean:.2f}, Median PPS = {getafe_pps_median:.2f}")
+print(f"Alcorcón: Mean PPS = {alcorcon_pps_mean:.2f}, Median PPS = {alcorcon_pps_median:.2f}")
+
+# Plot histograms for PPS comparison
+plt.figure(figsize=(10, 6))
+plt.hist(getafe_data['pps'], bins=10, alpha=0.5, label='Getafe', color='blue', edgecolor='black')
+plt.hist(alcorcon_data['pps'], bins=10, alpha=0.5, label='Alcorcón', color='orange', edgecolor='black')
+plt.title("Comparison of Price per Square Meter (PPS) Between Getafe and Alcorcón", fontsize=16)
+plt.xlabel("Price per Square Meter (USD/m²)", fontsize=14)
+plt.ylabel("Frequency", fontsize=14)
+plt.legend(title="Population")
+plt.grid(axis='y', linestyle='--', alpha=0.7)
+plt.tight_layout()
+
+# Show the plot
+plt.show()
+
+#what can I say
+
+
+# The Getafe PPSQM histogram hits a maximum of 81 units, with an average PPSQM of 1913 USD/m2.
+
+
+
+#Exercise 20. Make the same plot for 4 different populations (level5 column)
+#and rearrange them on the same graph. You must use the subset obtained in Exercise 14 (★★☆)
+#Hint: Make a scatter plot of each population using subplots.
+
+
+# Define the populations to plot
+populations = ["Fuenlabrada", "Leganés", "Getafe", "Alcorcón"]
+
+# Create a figure with subplots for each population
+fig, axes = plt.subplots(2, 2, figsize=(12, 10))
+axes = axes.flatten()
+
+# Scatter plot for each population
+for i, population in enumerate(populations):
+ subset = south_belt_df[south_belt_df['level5'] == population]
+ axes[i].scatter(subset['surface'], subset['price'], alpha=0.7, edgecolor='black', label=population)
+ axes[i].set_title(f"{population}", fontsize=14)
+ axes[i].set_xlabel("Surface (m²)", fontsize=12)
+ axes[i].set_ylabel("Price (USD)", fontsize=12)
+ axes[i].grid(True, linestyle='--', alpha=0.7)
+ axes[i].legend()
+
+# Adjust layout
+plt.tight_layout()
+plt.suptitle("Scatter Plots of Surface vs Price for South Belt Populations", fontsize=16, y=1.02)
+
+# Show the plot
+plt.show()
+
+
+
+#Exercise 21. Make a plot of the coordinates (latitude and longitude columns) of the south belt of Madrid by color of each population (you must use the subset obtained in Exercise 14) (★★★★)
+#Execute the following cell, and then start coding in the next one. You must implement a simple code that transforms the coordinates columns in a Python dictionary (add more information if needed) and then add it to the map
+
+
+
+from ipyleaflet import Map, basemaps
+
+# Map centered on (60 degrees latitude and -2.2 degrees longitude)
+# Latitude, longitude
+map = Map(center = (60, -2.2), zoom = 2, min_zoom = 1, max_zoom = 20,
+ basemap=basemaps.Stamen.Terrain)
+map
+## HERE: plot the coordinates of the estates
+
+from ipyleaflet import Map, Marker, basemaps, Icon
+
+# Map centered on the south belt of Madrid
+map = Map(center=(40.35, -3.75), zoom=12, basemap=basemaps.Stamen.Terrain)
+
+# Define colors for each population
+population_colors = {
+ "Fuenlabrada": "blue",
+ "Leganés": "green",
+ "Getafe": "orange",
+ "Alcorcón": "red"
+}
+
+# Add markers for each estate in the subset
+for _, row in south_belt_df.iterrows():
+ latitude = row['latitude']
+ longitude = row['longitude']
+ population = row['level5']
+ color = population_colors.get(population, "black") # Default to black if population not found
+
+ # Create and add a marker
+ marker = Marker(location=(latitude, longitude), icon=Icon(color=color), draggable=False)
+ map.add_layer(marker)
+
+# Display the map
+map
+
+
+
+# or:
+
+import pandas as pd
+
+# This CSV file contains semicolons instead of comas as separator
+ds = pd.read_csv('C:/Users/rober ugalde/Documents/real_estate.csv', sep=';')
+ds
+
+
+
+
+
+#pip install folium
+
+import folium
+
+# Define the populations in the south belt
+south_belt_populations = ["Fuenlabrada", "Leganés", "Getafe", "Alcorcón"]
+
+# Filter the original DataFrame (assuming the original DataFrame is named 'ds')
+south_belt_df = ds[ds['level5'].isin(south_belt_populations)]
+
+
+
+
+# Create the base map centered on the south belt of Madrid
+map_center = [40.35, -3.75] # Approximate center of the south belt of Madrid
+m = folium.Map(location=map_center, zoom_start=12)
+
+# Define colors for each population
+population_colors = {
+ "Fuenlabrada": "blue",
+ "Leganés": "green",
+ "Getafe": "orange",
+ "Alcorcón": "red"
+}
+
+# Add markers for each estate in the subset
+for _, row in south_belt_df.iterrows():
+ latitude = row['latitude']
+ longitude = row['longitude']
+ population = row['level5']
+ color = population_colors.get(population, "black") # Default to black if population not found
+
+ # Add a marker to the map
+ folium.Marker(
+ location=[latitude, longitude],
+ popup=f"{population}",
+ icon=folium.Icon(color=color)
+ ).add_to(m)
+
+# Display the map
+m.save('south_belt_map.html')
+print("Map saved as 'south_belt_map.html'. Open this file in your browser to view the map.")
diff --git a/surface_price_scatter plot__21nov24.png b/surface_price_scatter plot__21nov24.png
new file mode 100644
index 000000000..8ba6d9f31
Binary files /dev/null and b/surface_price_scatter plot__21nov24.png differ