This repository has been archived by the owner on Nov 15, 2024. It is now read-only.
WIP : Jimmyd/impact retour emploi #384
Open
JimmyDore wants to merge 31 commits into master from jimmyd/impact_retour_emploi.
+1,665 −8
Commits (31), all by JimmyDore:
841a83d  WIP Add Joris work about impact retour emploi
c51259f  Update setup.py to create executables in virtualenv for ire scripts
960aebf  wip
389034d  Industrialize daily copy script
a5f6d0f  Fix scripts launcher
5589f81  Add logs informations
7923a64  Add Exception for the daily parser script
f6f36b2  Clean and prepare jobs join & clean activity_logs-dpae for Jenkins
04902f2  Remove debug mode
d4e8756  Add log about size of DPAE file
c333095  wip make report
22aaf5f  Fix (approximately) issues with path
adfbcb1  Fix last problem with path
b3693ce  Add settings file with different paths
17e4c6f  Fix import module charts
0292e93  Add useful libs to install in DockerFile
a503443  Add xvfb to run imgkit from Docker image
bf21e56  Add comments on main script to make charts and excel report
5d439e3  Update name of DPAE file to be used
6ff55df  Add function to parse activity logs for PSE study
571b82f  Update the way to check if a file needs to be used or not
5299955  Add option to join data on SIREN (or SIRET as before)
98a068b  Remove debug mode
cd71c44  Fix import
d2e40a3  Fix check existence of csv generated file
a68b5ed  Fix SIREN issue int/str
5ab18af  Fix types of columns siren/siret
e9c9653  Fix pandas bug
ccf6a21  Try with SIRET to compare data
fc35a7a  Fix path to dpae file
43d3b82  Fix siren bug
The diff below shows the changes from a single commit: "wip".
110 changes: 75 additions & 35 deletions
110
labonneboite/scripts/impact_retour_emploi/daily_json_activity_parser.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
```diff
@@ -1,58 +1,98 @@
-import json
 import urllib.parse
 import os
+import json
 import pandas as pd
 from sqlalchemy import create_engine

-engine = create_engine('mysql://labonneboite:%s@127.0.0.1:3306/labonneboite' %urllib.parse.quote_plus('LaB@nneB@ite'))
-engine.connect()
+from labonneboite.importer import util as import_util
+from labonneboite.importer import settings as importer_settings
+
+create_table_query1 = 'CREATE TABLE IF NOT EXISTS `idpe_connect` ( \
+    `idutilisateur_peconnect` text, \
+    `dateheure` text \
+    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;'
+
+create_table_query2 = 'CREATE TABLE IF NOT EXISTS `activity_logs` ( \
+    `dateheure` text,\
+    `nom` text,\
+    `idutilisateur_peconnect` text,\
+    `siret` text\
+    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;'
+
+last_date_query = 'SELECT dateheure \
+    FROM idpe_connect \
+    ORDER BY dateheure DESC \
+    LIMIT 1'
+
+con, cur = import_util.create_cursor()
+cur.execute(create_table_query1)
+cur.execute(create_table_query2)
+cur.execute(last_date_query)
+row = cur.fetch()
+cur.close()
+con.close()

-data=[]
+data = []

-json_path='/mnt/datalakepe/vers_datalake/activity/'
-#json_path='/home/ads/Documents/labonneboite/labonneboite/importer/jobs/scripts_tre/'
-liste_files=os.listdir(json_path)
-liste_files.sort()
-json_files=liste_files[-1]
+# FIXME : Later, we'll be able to get datas, directly from PE datalake
+# Now we have a cron task which will cpy json activity logs to /srv/lbb/data
+json_logs_folder_path = importer_settings.INPUT_SOURCE_FOLDER
+json_logs_paths = os.listdir(json_logs_folder_path)
+json_logs_paths = [i for i in json_logs_paths if i.startswith('activity')]

-with open(json_path+json_files,'r') as json_files:
-    for line in json_files:
-        data.append(line)
-activity_dico={}
-i=1
-for line in data:
-    activity_dico[str(i)]=json.loads(line)
-    i+=1
+for json_logs_path in json_logs_paths:
+
+    with open(json_path+last_json, 'r') as json_file:
+        for line in json_file:
+            data.append(line)
+activities = {}
+i = 1
+for activity in data:
+    activities[str(i)] = json.loads(activity)
+    i += 1

-table_activity=pd.DataFrame.from_dict(activity_dico).transpose()
+activity_df = pd.DataFrame.from_dict(activities).transpose()

 def idpe_only(row):
-    if row['idutilisateur-peconnect']==None:
+    if row['idutilisateur-peconnect'] is None:
         return 0
-    else : return 1
+    return 1

-table_activity['provisoire']=table_activity.apply(lambda row:idpe_only(row),axis=1)
-table_activity=table_activity[table_activity.provisoire != 0]
-table_activity_2_bis= table_activity.drop_duplicates(subset=['idutilisateur-peconnect'], keep='first')
+activity_df['tri_idpec'] = activity_df.apply(
+    lambda row: idpe_only(row), axis=1)
+activity_df = activity_df[activity_df.tri_idpec != 0]
+activity_idpec = activity_df.drop_duplicates(
+    subset=['idutilisateur-peconnect'], keep='first')

-table_activity_2_bis=table_activity_2_bis[['dateheure','idutilisateur-peconnect']]
-table_activity_2_bis.to_sql(con=engine, name='idpe_connect', if_exists='append',index=False)
+activity_idpec = activity_idpec[[
+    'dateheure', 'idutilisateur-peconnect']]
+activity_idpec.to_sql(
+    con=engine, name='idpe_connect', if_exists='append', index=False)

-cliks_of_interest=['details','afficher-details','telecharger-pdf','ajout-favori']
+cliks_of_interest = ['details', 'afficher-details',
+                     'telecharger-pdf', 'ajout-favori']

-def tri_nom(row):
+def tri_names(row):
     if row['nom'] in cliks_of_interest:
         return True
-    else :
-        return False
+    return False

-table_activity['to_tej'] = table_activity.apply(lambda row: tri_nom(row), axis=1)
-table_activity_2 = table_activity[table_activity.to_tej == True]
+activity_df['tri_names'] = activity_df.apply(
+    lambda row: tri_names(row), axis=1)
+activity_logs = activity_df[activity_df.tri_names is True]

 def siret(row):
     return row['proprietes']['siret']

-table_activity_2['siret'] = table_activity_2.apply(lambda row: siret(row), axis=1)
-cols_of_interest=["dateheure","nom","idutilisateur-peconnect","siret"]
-table_activity_3=table_activity_2[cols_of_interest]
-table_activity_3.to_sql(con=engine, name='activity_logs', if_exists='append',index=False)
+activity_logs['siret'] = activity_logs.apply(
+    lambda row: siret(row), axis=1)
+cols_of_interest = ["dateheure", "nom", "idutilisateur-peconnect", "siret"]
+act_logs_good = activity_logs[cols_of_interest]
+act_logs_good.to_sql(con=engine, name='activity_logs',
+                     if_exists='append', index=False)
```
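As committed, this WIP revision appears not to run end to end: the read loop opens `json_path + last_json`, but `json_path` is deleted in this very diff and `last_json` is never defined; `row = cur.fetch()` is not a DB-API cursor method (`fetchone()` is); `engine` is still passed to `to_sql()` even though its `create_engine(...)` line is removed; and `activity_df[activity_df.tri_names is True]` tests identity against a pandas Series, which is always False. A minimal corrected sketch of the parsing and filtering steps, assuming the files under `importer_settings.INPUT_SOURCE_FOLDER` are the intended input and keeping the diff's variable names where possible:

```python
# A hypothetical corrected sketch, not the committed code. It assumes the
# activity* JSON files copied under importer_settings.INPUT_SOURCE_FOLDER
# are the intended input.
import json
import os

import pandas as pd

from labonneboite.importer import settings as importer_settings

json_logs_folder_path = importer_settings.INPUT_SOURCE_FOLDER
json_logs_paths = [f for f in os.listdir(json_logs_folder_path)
                   if f.startswith('activity')]

data = []
for json_logs_path in json_logs_paths:
    # os.path.join(...) replaces the undefined json_path + last_json.
    with open(os.path.join(json_logs_folder_path, json_logs_path), 'r') as json_file:
        for line in json_file:
            data.append(line)

# enumerate() replaces the hand-rolled counter i.
activities = {str(i): json.loads(line) for i, line in enumerate(data, 1)}
activity_df = pd.DataFrame.from_dict(activities).transpose()

# A boolean mask replaces the 0/1 tri_idpec helper column.
activity_df = activity_df[activity_df['idutilisateur-peconnect'].notnull()]
activity_idpec = activity_df.drop_duplicates(
    subset=['idutilisateur-peconnect'], keep='first')

cliks_of_interest = ['details', 'afficher-details',
                     'telecharger-pdf', 'ajout-favori']
# .isin() replaces the tri_names column and the `... is True` identity
# test, which is always False for a pandas Series.
activity_logs = activity_df[activity_df['nom'].isin(cliks_of_interest)]
```

Filtering with boolean masks and `.isin()` also avoids the helper columns (`tri_idpec`, `tri_names`) that would otherwise need to be dropped before writing to SQL.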
Review comment: add a line break before this line, since pandas is an external lib.
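The requested fix is the standard PEP 8 import grouping: standard-library modules first, then a blank line, then third-party packages such as pandas, then local application imports. A sketch of that grouping, using only the imports already present in this diff:

```python
# Standard library
import json
import os
import urllib.parse

# Third-party: blank line above, because pandas and SQLAlchemy are external libs
import pandas as pd
from sqlalchemy import create_engine

# Local application
from labonneboite.importer import util as import_util
from labonneboite.importer import settings as importer_settings
```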