From a8718ccad62cc084b9224e7b7a16cf29348662d1 Mon Sep 17 00:00:00 2001 From: Maksym Zhytnikov <63515947+Maxxx-zh@users.noreply.github.com> Date: Sat, 18 May 2024 13:19:04 +0300 Subject: [PATCH] [FSTORE-1385][APPEND] Data Loading Optimization (#262) * Data Loading Optimization --- .../1_feature_pipeline.ipynb | 134 +++++++++++------- .../fraud_cheque_detection/config.py | 3 + .../fraud_cheque_detection/functions/utils.py | 40 ++++++ 3 files changed, 125 insertions(+), 52 deletions(-) diff --git a/advanced_tutorials/fraud_cheque_detection/1_feature_pipeline.ipynb b/advanced_tutorials/fraud_cheque_detection/1_feature_pipeline.ipynb index a412a4a6..6ad2ae39 100644 --- a/advanced_tutorials/fraud_cheque_detection/1_feature_pipeline.ipynb +++ b/advanced_tutorials/fraud_cheque_detection/1_feature_pipeline.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "93202236", + "id": "610d4489", "metadata": {}, "source": [ "## 📝 Imports" @@ -11,24 +11,44 @@ { "cell_type": "code", "execution_count": 1, - "id": "712610c8", + "id": "c0bcaa61", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], "source": [ - "# !pip install -r requirements.txt -q" + "!pip install -r requirements.txt -q" ] }, { "cell_type": "code", "execution_count": 2, - "id": "0edd1da0", + "id": "c5760f50", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-05-07 12:50:06,474 INFO: generated new fontManager\n" + ] + } + ], "source": [ + "import config\n", "import pandas as pd\n", + "\n", "from functions.utils import (\n", " load_image,\n", " show_image,\n", + " download_and_extract_zip,\n", ")\n", "from functions.donut import (\n", " load_cheque_parser,\n", @@ -43,7 +63,7 @@ }, { "cell_type": "markdown", - "id": "be633a1e", + "id": "c25e5d83", "metadata": {}, "source": [ "## 🗄️ Data Loading" @@ -51,8 +71,18 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "67366ca2", + "execution_count": null, + "id": "65065778", + "metadata": {}, + "outputs": [], + "source": [ + "download_and_extract_zip(config.DOWNLOAD_URL)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e977adb3", "metadata": {}, "outputs": [ { @@ -168,7 +198,7 @@ "4 2912 26.png axis Clarice Blanc 1 " ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -188,8 +218,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "c1f83970", + "execution_count": 3, + "id": "88c9bf6c", "metadata": {}, "outputs": [ { @@ -209,7 +239,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -220,8 +250,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "19fa12ff", + "execution_count": 4, + "id": "a08ffae4", "metadata": {}, "outputs": [ { @@ -233,7 +263,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, 
"output_type": "execute_result" } @@ -244,7 +274,7 @@ }, { "cell_type": "markdown", - "id": "cd0a4565", + "id": "f5822567", "metadata": {}, "source": [ "## 👨🏻‍🎨 Data Visualization" @@ -252,8 +282,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "17a65a59", + "execution_count": 5, + "id": "7bdbd2f6", "metadata": {}, "outputs": [ { @@ -275,8 +305,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "cfad7f71", + "execution_count": 6, + "id": "9207bb77", "metadata": {}, "outputs": [ { @@ -336,7 +366,7 @@ "0 3755 120.png axis Edmee Pelletier 1 " ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -347,8 +377,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "127e74de", + "execution_count": 7, + "id": "367d510f", "metadata": {}, "outputs": [ { @@ -464,7 +494,7 @@ "504 68.png axis Colette Monjeau 0 " ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -475,8 +505,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "aa6510e8", + "execution_count": 8, + "id": "38e49887", "metadata": {}, "outputs": [ { @@ -497,8 +527,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "8545d92e", + "execution_count": 9, + "id": "9359ff42", "metadata": {}, "outputs": [ { @@ -520,7 +550,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "c2992e1e", + "id": "08848b90", "metadata": {}, "outputs": [ { @@ -541,7 +571,7 @@ }, { "cell_type": "markdown", - "id": "750c5861", + "id": "def821e2", "metadata": {}, "source": [ "## 👩🏻‍🔬 Feature Engineering \n" @@ -549,7 +579,7 @@ }, { "cell_type": "markdown", - "id": "b8f72b96", + "id": "f63c2ba5", "metadata": {}, "source": [ "### ⛳️ Spell Check \n" @@ -558,7 +588,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "8852eaa8", + "id": "0c92a0c0", "metadata": {}, "outputs": [ { @@ -579,7 +609,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "a0758b6e", + "id": 
"4f32a09d", "metadata": {}, "outputs": [ { @@ -600,7 +630,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "8c888d66", + "id": "aa6e373f", "metadata": {}, "outputs": [ { @@ -621,7 +651,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "aec3c28e", + "id": "64a54945", "metadata": {}, "outputs": [ { @@ -642,7 +672,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "6a914a54", + "id": "9dc26883", "metadata": {}, "outputs": [ { @@ -663,7 +693,7 @@ { "cell_type": "code", "execution_count": 17, - "id": "75d4f920", + "id": "6c1699fc", "metadata": {}, "outputs": [ { @@ -779,7 +809,7 @@ { "cell_type": "code", "execution_count": 18, - "id": "49b32414", + "id": "4362adec", "metadata": {}, "outputs": [ { @@ -891,7 +921,7 @@ }, { "cell_type": "markdown", - "id": "b26f754a", + "id": "7798f0c5", "metadata": {}, "source": [ "### ⛳️ Amount in Letter and Number Match \n" @@ -900,7 +930,7 @@ { "cell_type": "code", "execution_count": 19, - "id": "061e85f3", + "id": "253c16d8", "metadata": {}, "outputs": [ { @@ -921,7 +951,7 @@ { "cell_type": "code", "execution_count": 20, - "id": "c457f090", + "id": "17e7799d", "metadata": {}, "outputs": [ { @@ -942,7 +972,7 @@ { "cell_type": "code", "execution_count": 21, - "id": "3a7d873e", + "id": "a268a403", "metadata": {}, "outputs": [ { @@ -963,7 +993,7 @@ { "cell_type": "code", "execution_count": 22, - "id": "a8c3fce2", + "id": "771196e4", "metadata": {}, "outputs": [ { @@ -984,7 +1014,7 @@ { "cell_type": "code", "execution_count": 23, - "id": "fd965acb", + "id": "151b4791", "metadata": {}, "outputs": [ { @@ -1110,7 +1140,7 @@ { "cell_type": "code", "execution_count": 24, - "id": "2e25d5a1", + "id": "d7352554", "metadata": {}, "outputs": [ { @@ -1231,7 +1261,7 @@ { "cell_type": "code", "execution_count": 25, - "id": "b5dab7e4", + "id": "fca40774", "metadata": {}, "outputs": [ { @@ -1254,7 +1284,7 @@ }, { "cell_type": "markdown", - "id": "f1930485", + "id": "1037b491", "metadata": {}, "source": [ "## 🔮 Connecting to 
Hopsworks Feature Store " @@ -1263,7 +1293,7 @@ { "cell_type": "code", "execution_count": 26, - "id": "d6386c67", + "id": "57b46c7b", "metadata": {}, "outputs": [ { @@ -1287,7 +1317,7 @@ }, { "cell_type": "markdown", - "id": "b6df04ec", + "id": "c46a9f7b", "metadata": {}, "source": [ "## 🪄 Feature Group Creation " @@ -1296,7 +1326,7 @@ { "cell_type": "code", "execution_count": 27, - "id": "c1cb8ad0", + "id": "84f64635", "metadata": {}, "outputs": [ { @@ -1347,7 +1377,7 @@ }, { "cell_type": "markdown", - "id": "998001fe", + "id": "d70a5d38", "metadata": {}, "source": [ "---" diff --git a/advanced_tutorials/fraud_cheque_detection/config.py b/advanced_tutorials/fraud_cheque_detection/config.py index 6c57dcdb..d13f6e1a 100644 --- a/advanced_tutorials/fraud_cheque_detection/config.py +++ b/advanced_tutorials/fraud_cheque_detection/config.py @@ -1,6 +1,9 @@ ### Dataset Configuration DATASET_NAME = "shivi/cheques_sample_data" +### Cheque Data URL +DOWNLOAD_URL = "https://repo.hops.works/dev/jdowling/cheque-fraud.zip" + ### Donut Configuration DONUT_BASE_REPO = "naver-clova-ix/donut-base" #"nielsr/donut-base" DONUT_FT_REPO = "shivi/donut-cheque-parser" diff --git a/advanced_tutorials/fraud_cheque_detection/functions/utils.py b/advanced_tutorials/fraud_cheque_detection/functions/utils.py index 9f00f123..90b2f9f1 100644 --- a/advanced_tutorials/fraud_cheque_detection/functions/utils.py +++ b/advanced_tutorials/fraud_cheque_detection/functions/utils.py @@ -1,6 +1,10 @@ import matplotlib.pyplot as plt from PIL import Image import numpy as np +import requests +import zipfile +import os + def load_image(image_name, folder_name='data/images/'): """ @@ -17,6 +21,7 @@ def load_image(image_name, folder_name='data/images/'): image = Image.open(folder_name + image_name).convert('RGB') return image + def show_image(image): """ Displays an image using matplotlib with specific figure sizing and no axis. 
def download_and_extract_zip(url: str, extract_to: str = '.', timeout: float = 60.0) -> None:
    """
    Download a .zip file from a given URL and extract it into the specified directory.

    The archive is streamed in chunks to a file in the current working directory
    (named after the last segment of the URL path) and then extracted into
    ``extract_to``. The downloaded archive is left in place after extraction.

    Args:
        url (str): The URL of the .zip file to download.
        extract_to (str): Directory to extract the files into, defaults to the current directory.
        timeout (float): Seconds to wait for the server to connect or to send data
            before giving up, defaults to 60 seconds.

    Raises:
        Exception: Raises an exception if the file could not be downloaded (non-200 status code).
        requests.exceptions.RequestException: If the connection fails or times out.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # Function-scope import so this block does not depend on a module-level import.
    from urllib.parse import urlsplit

    # Use only the path component of the URL so a query string or fragment never
    # ends up in the local filename; fall back to a fixed name when the path has
    # no final segment (e.g. the URL ends with '/').
    filename = os.path.basename(urlsplit(url).path) or 'download.zip'

    # The context manager releases the connection even if writing fails part-way.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            # If the download fails, raise an exception with the status code
            raise Exception(f"❌ Failed to download the file. Status code: {response.status_code}")

        # Write the body chunk by chunk so the whole archive is never held in memory.
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    print(f"Downloaded {filename}")

    # Extract the .zip file using the ZipFile class
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted {filename} to {extract_to}")