Skip to content

Commit

Permalink
Prépare les données
Browse files Browse the repository at this point in the history
  • Loading branch information
linogaliana committed Dec 20, 2024
1 parent 17b7097 commit 0051f7a
Showing 1 changed file with 151 additions and 0 deletions.
151 changes: 151 additions & 0 deletions applications/data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Conversion des bases en GeoParquet\n",
"\n",
"## 1. La base DVF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import requests\n",
"\n",
"url = \"https://files.data.gouv.fr/geo-dvf/latest/csv/2022/full.csv.gz\"\n",
"file_name = \"dvf.csv.gz\"\n",
"# Download under a temporary name first: an interrupted transfer must not leave a\n",
"# truncated file that the existence check below would treat as a complete download.\n",
"partial_name = file_name + \".part\"\n",
"\n",
"# Skip the download entirely when the archive is already on disk.\n",
"if not os.path.exists(file_name):\n",
"    # Stream in chunks rather than buffering the whole archive in memory; the timeout\n",
"    # keeps the cell from hanging indefinitely on a stalled connection.\n",
"    with requests.get(url, stream=True, timeout=60) as response:\n",
"        if response.status_code == 200:\n",
"            with open(partial_name, \"wb\") as f:\n",
"                for chunk in response.iter_content(chunk_size=1024 * 1024):\n",
"                    f.write(chunk)\n",
"            # Give the file its final name only once it has been fully written.\n",
"            os.replace(partial_name, file_name)\n",
"            print(\"Téléchargement réussi.\")\n",
"        else:\n",
"            print(f\"Échec du téléchargement. Code d'état : {response.status_code}\")\n",
"else:\n",
"    print(f\"Le fichier '{file_name}' existe déjà. Aucun téléchargement nécessaire.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import geopandas as gpd\n",
"\n",
"# Force the two code columns to strings instead of letting pandas infer a dtype\n",
"# (presumably to keep codes that are not plain integers intact - confirm against the data).\n",
"dvf = pd.read_csv(\"dvf.csv.gz\", dtype={'code_commune': \"str\", \"code_departement\": \"str\"})\n",
"\n",
"# Build point geometries from the longitude/latitude columns. The CRS is passed to the\n",
"# constructor because GeoDataFrame.set_crs returns a new object by default, so calling\n",
"# it without keeping the result leaves the frame without any CRS.\n",
"gdf = gpd.GeoDataFrame(\n",
"    dvf,\n",
"    geometry=gpd.points_from_xy(x=dvf.longitude, y=dvf.latitude),\n",
"    crs=\"EPSG:4326\",\n",
")\n",
"\n",
"# Convert every object-dtype column to the pandas string dtype.\n",
"object_cols = gdf.select_dtypes(['object']).columns\n",
"gdf[object_cols] = gdf[object_cols].astype('string')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Also write the per-row `bbox` struct column (requires geopandas >= 1.0): the\n",
"# bounding-box query further down filters on bbox.xmin / bbox.ymin, and that column\n",
"# is only present in the file when write_covering_bbox is enabled.\n",
"gdf.to_parquet(\"dvf.parquet\", write_covering_bbox=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import duckdb\n",
"\n",
"# Install, then load, DuckDB's `spatial` extension on the default connection.\n",
"install_spatial, load_spatial = \"INSTALL spatial;\", \"LOAD spatial;\"\n",
"duckdb.execute(install_spatial)\n",
"duckdb.execute(load_spatial)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build a relation over the whole parquet file and let the notebook display it.\n",
"dvf_relation = duckdb.sql(\"SELECT * FROM read_parquet('dvf.parquet')\")\n",
"dvf_relation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"reference_lon = 2.3602  # Replace with your reference longitude\n",
"reference_lat = 48.9245  # Replace with your reference latitude\n",
"\n",
"# Half-width of the search box: roughly 1 km expressed in degrees of latitude.\n",
"buffer_distance_deg = 0.009\n",
"# A degree of longitude gets shorter by a factor cos(latitude), so the same ground\n",
"# distance spans more degrees east-west than north-south. Without this correction the\n",
"# box would be noticeably narrower than 1 km in longitude at this latitude.\n",
"buffer_lon_deg = buffer_distance_deg / math.cos(math.radians(reference_lat))\n",
"\n",
"# Approximate proximity filter: keep the rows whose bbox corner lies inside the box.\n",
"# The result is materialised as a pandas DataFrame.\n",
"toto = duckdb.sql(\n",
"    f\"\"\"\n",
"    SELECT *\n",
"    FROM read_parquet('dvf.parquet')\n",
"    WHERE\n",
"        bbox.xmin BETWEEN {reference_lon - buffer_lon_deg} AND {reference_lon + buffer_lon_deg}\n",
"        AND bbox.ymin BETWEEN {reference_lat - buffer_distance_deg} AND {reference_lat + buffer_distance_deg}\n",
"    \"\"\"\n",
").to_df()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expose the parquet file as a DuckDB view named `dvf`.\n",
"# The path is a single-quoted SQL string literal, as in the other queries of this\n",
"# notebook (double quotes denote identifiers in SQL); no f-string is needed because\n",
"# nothing is interpolated.\n",
"duckdb.sql(\"CREATE OR REPLACE VIEW dvf AS SELECT * FROM read_parquet('dvf.parquet')\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pull a handful of rows from the `dvf` view into pandas and show the first two.\n",
"dvf_preview = duckdb.sql(\"SELECT * FROM dvf LIMIT 5\").df()\n",
"dvf_preview.head(2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 0051f7a

Please sign in to comment.