Skip to content

Commit

Permalink
add import script
Browse files Browse the repository at this point in the history
  • Loading branch information
SebastienReuiller committed Oct 18, 2023
1 parent aa4bba1 commit 41a612b
Showing 1 changed file with 165 additions and 0 deletions.
165 changes: 165 additions & 0 deletions lemarche/siaes/management/commands/import_esat_from_asp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import csv
import os
import time

import openai
from django.conf import settings
from django.core.management.base import BaseCommand

from lemarche.siaes import constants as siae_constants
from lemarche.siaes.models import Siae
from lemarche.utils.apis.api_entreprise import etablissement_get_or_error # exercice_get_or_error
from lemarche.utils.apis.geocoding import get_geocoding_data
from lemarche.utils.constants import DEPARTMENT_TO_REGION, department_from_postcode
from lemarche.utils.data import rename_dict_key


# CSV export of the ESAT directory; it is read from the directory containing this command module.
FILE_NAME = "Annuaire_ESAT_20230717.csv"
FILE_PATH = f"{os.path.dirname(os.path.realpath(__file__))}/{FILE_NAME}"
# Column names found in the CSV header.
FIELD_NAME_LIST = ["Département", "Dénomination", "N° de Siret", "Adresse"]

# OpenAI client configuration, taken from the Django settings at import time.
openai.organization = settings.OPENAI_ORG
openai.api_base = settings.OPENAI_API_BASE
openai.api_key = settings.OPENAI_API_KEY
# Model name passed to every chat completion request below.
aimodel = settings.OPENAI_MODEL


def read_csv(file_path=None):
    """
    Load the ESAT directory CSV and return its rows as a list of dicts.

    Each row is keyed by the header names of the file
    ("Département", "Dénomination", "N° de Siret", "Adresse").

    Args:
        file_path: path of the CSV file to read. Defaults to the module-level
            FILE_PATH when omitted, which is what existing callers rely on.

    Returns:
        A list with one dict per data row (empty when the file only has a header).

    Raises:
        OSError: if the file cannot be opened.
    """
    if file_path is None:
        file_path = FILE_PATH

    # The header holds accented names, so decode explicitly instead of relying on the
    # platform default encoding. "utf-8-sig" reads plain UTF-8 and also drops a leading
    # BOM, which would otherwise end up glued to the first column name.
    # NOTE(review): assumes the export is UTF-8 encoded - confirm against the real file.
    # newline="" lets the csv module handle line endings inside quoted fields itself.
    with open(file_path, newline="", encoding="utf-8-sig") as csv_file:
        return list(csv.DictReader(csv_file, delimiter=","))


class Command(BaseCommand):
    """
    Import ESAT structures from the ASP directory CSV.

    Each CSV row is matched against the existing Siae objects by SIRET:
    - when a Siae already exists, its stored address is compared with the one in the
      file, and OpenAI is asked whether both designate the same place before the
      address is re-geocoded and saved;
    - otherwise a new Siae is created, enriched with API Entreprise data.

    Usage: poetry run python manage.py import_esat_from_asp
    """

    # Upper bound on OpenAI attempts for one address comparison, so that a persistent
    # failure (bad credentials, quota, outage) cannot block the import forever.
    OPENAI_MAX_ATTEMPTS = 5
    # Seconds to wait between two OpenAI attempts.
    OPENAI_RETRY_DELAY = 3

    def handle(self, *args, **options):
        """Run the import over every row of the CSV and print a summary."""
        print("-" * 80)
        esat_list = read_csv()

        print("Importing ESAT FROM ASP...")

        already_exists = 0
        address_changes = 0
        news = 0

        for progress, esat in enumerate(esat_list, start=1):
            if progress % 50 == 0:
                print(f"{progress}...")

            esat_siret = esat["N° de Siret"]
            esat_denom = esat["Dénomination"]
            if siae := Siae.objects.filter(siret=esat_siret).first():
                already_exists += 1

                address_in_db = f"{siae.address} {siae.post_code} {siae.city}".strip().lower()
                address_in_file = esat["Adresse"].strip().lower()

                # AI used to check if the address has really changed
                if address_in_file != address_in_db:
                    answer = self._ask_if_same_place(address_in_db, address_in_file)
                    # Case-insensitive check: the model is free to answer "no", "No." ...
                    # A missing answer (None) leaves the stored address untouched.
                    if answer is not None and answer.strip().lower().startswith("no"):
                        try:
                            updated = self.update_esat_address(siae, address_in_file)
                        except Exception as e:
                            # Best effort: one failing row must not abort the whole import.
                            print(f"Address update failed for {esat_denom} ({esat_siret}): {e!r}")
                            updated = False
                        # Only count addresses that were really geocoded and saved.
                        if updated:
                            address_changes += 1
                    # Logged for every textual mismatch, whatever the model answered.
                    print(f"{esat_denom} ({esat_siret}) addess change : {address_in_db} -> {address_in_file}")
            else:
                news += 1
                print(f"{esat_denom} ({esat_siret}) is a new !")
                self.import_esat(esat)

            # avoid DDOSing APIs
            time.sleep(1)

        print(
            f"Done with {already_exists} already_exits ({address_changes} addresses updated)"
            f" and {news} new esat added."
        )

    def _ask_if_same_place(self, address_in_db, address_in_file):
        """
        Ask OpenAI whether two address strings refer to the same place.

        The request is retried up to OPENAI_MAX_ATTEMPTS times, waiting
        OPENAI_RETRY_DELAY seconds between attempts.

        Returns:
            The text of the model's answer, or None when no usable answer could be
            obtained within the allowed attempts.
        """
        prompt = (
            "By answering yes or no, tell me if these two addresses, "
            f'"{address_in_db}" and "{address_in_file}", refer to the same place ?'
        )
        messages = [{"role": "user", "content": prompt}]

        for attempt in range(1, self.OPENAI_MAX_ATTEMPTS + 1):
            try:
                chat_completion = openai.ChatCompletion.create(
                    model=aimodel, temperature=0.5, max_tokens=150, messages=messages, request_timeout=15
                )
                answer = chat_completion.to_dict_recursive()["choices"][0]["message"]["content"]
            except Exception as e:
                # Exception (not a bare except) so that Ctrl-C still stops the command.
                print(f"OpenAI API error ({e!r}), attempt {attempt}/{self.OPENAI_MAX_ATTEMPTS}")
                if attempt < self.OPENAI_MAX_ATTEMPTS:
                    print("sleep before retry")
                    time.sleep(self.OPENAI_RETRY_DELAY)
            else:
                return answer or ""

        print("OpenAI API unavailable, address comparison skipped")
        return None

    def update_esat_address(self, siae, address):
        """
        Geocode `address` and store the resulting location on `siae`.

        Args:
            siae: the Siae object to update; it is saved when geocoding succeeds.
            address: free-text address passed to the geocoding API.

        Returns:
            True when geocoding data was found and the Siae was saved, False otherwise.
        """
        geocoding_data = get_geocoding_data(address)
        if not geocoding_data:
            print(f"Geocoding not found,{siae.name},{address}")
            return False

        print(geocoding_data)
        siae.address = geocoding_data["address"]
        siae.post_code = geocoding_data["post_code"]
        siae.city = geocoding_data["city"]
        siae.department = department_from_postcode(geocoding_data["post_code"])
        siae.region = DEPARTMENT_TO_REGION[siae.department]
        siae.coords = geocoding_data["coords"]
        siae.save()
        return True

    def import_esat(self, esat):  # noqa C901
        """
        Create a new ESAT Siae from one CSV row, then geocode its address.

        The row is read but not modified; the fields given to the model are built in
        a separate dict, so unexpected extra CSV columns are never passed to
        `Siae.objects.create`.

        Args:
            esat: CSV row with the keys "Dénomination", "N° de Siret" and "Adresse".

        Errors raised while creating or geocoding the object are printed, not
        propagated, so that one bad row does not stop the import.
        """
        full_address = esat["Adresse"]
        siret = esat["N° de Siret"]

        siae_fields = {
            # store raw dict, tagged with the import source
            "import_raw_object": {**esat, "import_source": "esat_asp"},
            # defaults
            "kind": siae_constants.KIND_ESAT,
            "source": Siae.SOURCE_ESAT,
            "geo_range": siae_constants.GEO_RANGE_DEPARTMENT,
            # basic fields; split/join trims the name and collapses runs of whitespace
            "name": " ".join(esat["Dénomination"].split()),
            "siret": siret,
            "siret_is_valid": True,
        }

        # enrich with API Entreprise
        etablissement, error = etablissement_get_or_error(
            siret, reason="Mise à jour données Marché de la plateforme de l'Inclusion"
        )
        if etablissement:
            print(etablissement)
            siae_fields["nature"] = (
                Siae.NATURE_HEAD_OFFICE if etablissement["is_head_office"] else Siae.NATURE_ANTENNA
            )
            siae_fields["is_active"] = not etablissement["is_closed"]
            siae_fields["naf"] = etablissement["naf"]
            if etablissement["employees"]:
                siae_fields["api_entreprise_employees"] = etablissement["employees"]
            if etablissement["date_constitution"]:
                siae_fields["api_entreprise_date_constitution"] = etablissement["date_constitution"]
        elif error:
            # The ESAT is still created, only without the API Entreprise fields.
            print(f"API Entreprise error for {siret}: {error}")

        # create object
        try:
            print("Create new esat..")
            print(siae_fields)
            siae = Siae.objects.create(**siae_fields)
            self.update_esat_address(siae, full_address)
        except Exception as e:
            print(e)
            print(siae_fields)

        # avoid DDOSing APIs
        time.sleep(0.3)

0 comments on commit 41a612b

Please sign in to comment.