This repository has been archived by the owner on Nov 15, 2024. It is now read-only.

WIP : Jimmyd/impact retour emploi #384

Open
Wants to merge 31 commits into master
Changes from 1 commit
Commits (31)
841a83d
WIP Add Joris work about impact retour emploi
JimmyDore Jun 27, 2019
c51259f
Update setup.py to create executables in virtualenv for ire scripts
JimmyDore Jun 28, 2019
960aebf
wip
JimmyDore Jul 3, 2019
389034d
Industrialize daily copy script
JimmyDore Jul 5, 2019
a5f6d0f
Fix scripts launcher
JimmyDore Jul 5, 2019
5589f81
Add logs informations
JimmyDore Jul 5, 2019
7923a64
Add Exception for the daily parser script
JimmyDore Jul 5, 2019
f6f36b2
Clean and prepare jobs join & clean activity_logs-dpae for Jenkins
JimmyDore Jul 9, 2019
04902f2
Remove debug mode
JimmyDore Jul 9, 2019
d4e8756
Add log about size of DPAE file
JimmyDore Jul 9, 2019
c333095
wip make report
JimmyDore Jul 10, 2019
22aaf5f
Fix (approximately) issues with path
JimmyDore Jul 10, 2019
adfbcb1
Fix last problem with path
JimmyDore Jul 10, 2019
b3693ce
Add settings file with different paths
JimmyDore Jul 10, 2019
17e4c6f
Fix import module charts
JimmyDore Jul 10, 2019
0292e93
Add useful libs to install in DockerFile
JimmyDore Jul 11, 2019
a503443
Add xvfb to run imgkit from Docker image
JimmyDore Jul 11, 2019
bf21e56
Add comments on main script to make charts and excel report
JimmyDore Jul 12, 2019
5d439e3
Update name of DPAE file to be used
JimmyDore Sep 23, 2019
6ff55df
Add function to parse activity logs for PSE study
JimmyDore Nov 13, 2019
571b82f
Update the way to check if a file needs to be used or not
JimmyDore Nov 20, 2019
5299955
Add option to join data on SIREN (or SIRET as before)
JimmyDore Dec 18, 2019
98a068b
Remove debug mode
JimmyDore Dec 18, 2019
cd71c44
Fix import
JimmyDore Dec 18, 2019
d2e40a3
Fix check existence of csv generated file
JimmyDore Dec 18, 2019
a68b5ed
Fix SIREN issue int/str
JimmyDore Dec 24, 2019
5ab18af
Fix types of columns siren/siret
JimmyDore Dec 26, 2019
e9c9653
Fix pandas bug
JimmyDore Dec 26, 2019
ccf6a21
Try with SIRET to compare data
JimmyDore Dec 26, 2019
fc35a7a
Fix path to dpae file
JimmyDore Dec 26, 2019
43d3b82
Fix siren bug
Feb 18, 2020
wip
JimmyDore authored and dejafait committed Jan 3, 2020
commit 960aebfbdda46c3c19e7b2c7f5831378a22eba25
75 changes: 35 additions & 40 deletions labonneboite/scripts/impact_retour_emploi/clean_tre.py
@@ -1,15 +1,12 @@
import urllib
import shutil
from os import makedirs, remove, listdir
from os.path import abspath, exists
from datetime import date
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import xlsxwriter
import openpyxl
import openpyxl.styles
from sqlalchemy import create_engine
import urllib
import shutil
from os import *
from os.path import *
from datetime import date
import charts as charts
import fr_charts as fr
import grand_public as gd
@@ -33,8 +30,10 @@
year_idpe = idpe_connect['YEAR(dateheure)'].tolist()

month_year_idpe = [] # Formatting
for i in range(len(month_idpe)):
i=0
while i < len(month_idpe):
month_year_idpe.append(str(year_idpe[i])+'/'+str(month_idpe[i]))
i+=1
idpe_connect['Date'] = month_year_idpe

# Count all distinct IDPEC
@@ -59,15 +58,15 @@
try:
shutil.rmtree(path+'images/')
except:
None
pass
try:
shutil.rmtree(path+'gd_pub/')
except:
None
pass
try:
shutil.rmtree(path+'Clean/')
except:
None
pass

makedirs(path+'images/')
makedirs(path+'gd_pub/')
@@ -101,8 +100,7 @@ def get_type_contrat(row):
return 'CDD'
elif row['dc_typecontrat_id'] == 2:
return 'CDI'
else:
return 'CTT'
return 'CTT'


act_dpae_2['type_contrat'] = act_dpae_2.apply(
@@ -135,8 +133,7 @@ def get_nbr_jours_act_emb(row):
def get_priv_pub(row):
if row['dc_privepublic'] == 0:
return 'Public'
else:
return 'Prive'
return 'Prive'


act_dpae_2['dc_privepublic'] = act_dpae_2.apply(
@@ -154,8 +151,7 @@ def good_format(row):
def del_interrogation(row):
if row['tranche_age'] == 'de 26 ans ? 50 ans':
return 'entre 26 et 50 ans'
else:
return row['tranche_age']
return row['tranche_age']


act_dpae_2['tranche_age'] = act_dpae_2.apply(
@@ -166,8 +162,7 @@ def del_cdd_incoherent(row):
try:
if int(row['duree_activite_cdd_jours']) > 1200:
return 1
else:
return 0
return 0
except:
return 0

@@ -253,9 +248,9 @@ def location(num_image, file_name, link=False): # Pasting of pictures
('08', "type_contrat", "Type de contrat obtenu", "type_cont_gd_public_pie", None): charts.Pie,
('10', "tranche_age", "Pourcentage des differentes tranches d'ages dans les DPAE", "age_gd_public_pie", None): charts.Pie,
('11', "dc_privepublic", "Pourcentage d'embauche dans le privé et dans le public", "prive_pub_gd_public_pie", None): charts.Pie,
('12', "code_postal", "Part des DPAE par anciennes régions", "old_region_gd_public_svg", "old_region"): fr.map,
('13', "code_postal", "Part des DPAE par nouvelles régions", "new_region_gd_public_svg", "new_region"): fr.map,
('14', "code_postal", "Part des DPAE par département", "dep_gd_public_svg", "departement"): fr.map,
('12', "code_postal", "Part des DPAE par anciennes régions", "old_region_gd_public_svg", "old_region"): fr.map_fr,
('13', "code_postal", "Part des DPAE par nouvelles régions", "new_region_gd_public_svg", "new_region"): fr.map_fr,
('14', "code_postal", "Part des DPAE par département", "dep_gd_public_svg", "departement"): fr.map_fr,
('15', 'date_embauche', all_the_names_1, 'cohorte_1_gd_public', 'date_activite'): charts.Stacked_Bar,
('16', 'date_activite', all_the_names_2, 'cohorte_2_gd_public', 'date_embauche'): charts.Stacked_Bar}

@@ -311,7 +306,7 @@ def location(num_image, file_name, link=False): # Pasting of pictures

##################################################################################################

num_image = 1
num_im = 1
package_svg = []
all_stats = []

@@ -334,45 +329,45 @@ def location(num_image, file_name, link=False): # Pasting of pictures

# Iterate through the created images
# Pasting of charts from the directory
for file_name in sorted(listdir(path+'images/')):
for filename in sorted(listdir(path+'images/')):

img = openpyxl.drawing.image.Image(path+'images/'+file_name)
img = openpyxl.drawing.image.Image(path+'images/'+filename)

if "gd_public" in file_name:
shutil.copyfile(path+'images/'+file_name, path+'gd_pub/'+file_name)
if "gd_public" in filename:
shutil.copyfile(path+'images/'+filename, path+'gd_pub/'+filename)

if "table" in file_name: # it's the table of cohorte --> it's a different size
if "table" in filename: # it's the table of cohorte --> it's a different size
img.anchor = 'H1'
img.height = 750
img.width = 900
else:
# using the function location in order to place the charts
img.anchor = location(num_image, file_name)
img.anchor = location(num_im, filename)
img.height = 400
img.width = 500

ws.add_image(img) # Pasting

# if it's map --> pasting web link below charts
if exists(path+file_name[:-3]+'svg'):
cells_link = ws[location(num_image, file_name, True)]
cells_link.hyperlink = file_name[:-3]+'svg'
if exists(path+filename[:-3]+'svg'):
cells_link = ws[location(num_im, filename, True)]
cells_link.hyperlink = filename[:-3]+'svg'
cells_link.font = openpyxl.styles.Font(
size=5.5, italic=True, underline='single')
cells_link.alignment = openpyxl.styles.Alignment(horizontal="center")
package_svg.append((path, file_name[:-3]+'svg'))
package_svg.append((path, filename[:-3]+'svg'))

num_image += 1
num_im += 1

# if it's the last charts of the sheet --> change sheet
if num_image == (sheet_sizes[num_sheet]+1):
if num_im == (sheet_sizes[num_sheet]+1):
try:
num_sheet += 1
book.create_sheet(sheet_names[num_sheet])
ws = book.worksheets[num_sheet]
num_image = 0
num_im = 0
except:
None
pass

book.save('Impact_lbb_DPAE.xlsx')

@@ -392,11 +387,11 @@ def location(num_image, file_name, link=False): # Pasting of pictures
'Clean/'+'Impact_lbb_DPAE.xlsx')
for path, svg in package_svg:
shutil.copyfile(path+svg, path+'Clean/'+svg)
remove("filename.html")
remove("table.html")
for last_files in listdir(path):
try:
extension = last_files[last_files.index('.'):]
if extension == '.svg' or extension == '.xlsx':
remove(last_files)
except:
None # It's a directory
pass # It's a directory
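
The block above in clean_tre.py leans on a recurring openpyxl pattern: paste a chart image into a worksheet, size it, then put a small hyperlink cell underneath. Here is a compact sketch of that pattern, with hypothetical paths, cell anchors, and sizes, and assuming Pillow is installed for image support:

from os import listdir
from os.path import join

from openpyxl import Workbook
from openpyxl.drawing.image import Image as XLImage
from openpyxl.styles import Font, Alignment

book = Workbook()
ws = book.active

# Hypothetical slots: where each chart goes and where its link cell sits.
anchor_cells = ['A1', 'H1', 'A25', 'H25']
link_cells = ['A24', 'H24', 'A48', 'H48']

# 'report_images/' is a placeholder folder of PNG charts.
for idx, filename in enumerate(sorted(listdir('report_images/'))):
    img = XLImage(join('report_images', filename))
    img.anchor = anchor_cells[idx % len(anchor_cells)]
    img.height = 400   # pixels
    img.width = 500
    ws.add_image(img)

    # Small italic hyperlink below the chart, as clean_tre.py does for the SVG maps.
    cell = ws[link_cells[idx % len(link_cells)]]
    cell.hyperlink = filename[:-3] + 'svg'
    cell.font = Font(size=5.5, italic=True, underline='single')
    cell.alignment = Alignment(horizontal='center')

book.save('Impact_lbb_DPAE.xlsx')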
@@ -1,58 +1,98 @@
import json
import urllib.parse
import os
import json
import pandas as pd
Contributor comment: add a blank line before this, since pandas is an external library (a short import-grouping sketch follows at the end of this diff).

from sqlalchemy import create_engine

engine = create_engine('mysql://labonneboite:%s@127.0.0.1:3306/labonneboite' %urllib.parse.quote_plus('LaB@nneB@ite'))
engine.connect()
from labonneboite.importer import util as import_util
from labonneboite.importer import settings as importer_settings

create_table_query1 = 'CREATE TABLE IF NOT EXISTS `idpe_connect` ( \
`idutilisateur_peconnect` text, \
`dateheure` text \
) ENGINE=InnoDB DEFAULT CHARSET=utf8;'

create_table_query2 = 'CREATE TABLE IF NOT EXISTS `activity_logs` ( \
`dateheure` text,\
`nom` text,\
`idutilisateur_peconnect` text,\
`siret` text\
) ENGINE=InnoDB DEFAULT CHARSET=utf8;'

last_date_query = 'SELECT dateheure \
FROM idpe_connect \
ORDER BY dateheure DESC \
LIMIT 1'

con, cur = import_util.create_cursor()
cur.execute(create_table_query1)
cur.execute(create_table_query2)
cur.execute(last_date_query)
row = cur.fetch()
cur.close()
con.close()

data = []

data=[]
# FIXME : Later, we'll be able to get datas, directly from PE datalake
# Now we have a cron task which will cpy json activity logs to /srv/lbb/data
json_logs_folder_path = importer_settings.INPUT_SOURCE_FOLDER
json_logs_paths = os.listdir(json_logs_folder_path)
json_logs_paths = [i for i in json_logs_paths if i.startswith('activity')]

json_path='/mnt/datalakepe/vers_datalake/activity/'
#json_path='/home/ads/Documents/labonneboite/labonneboite/importer/jobs/scripts_tre/'
liste_files=os.listdir(json_path)
liste_files.sort()
json_files=liste_files[-1]
for json_logs_path in json_logs_paths:


with open(json_path+json_files,'r') as json_files:
for line in json_files:
with open(json_path+last_json, 'r') as json_file:
for line in json_file:
data.append(line)
activity_dico={}
i=1
for line in data:
activity_dico[str(i)]=json.loads(line)
i+=1
activities = {}
i = 1
for activity in data:
activities[str(i)] = json.loads(activity)
i += 1

activity_df = pd.DataFrame.from_dict(activities).transpose()

table_activity=pd.DataFrame.from_dict(activity_dico).transpose()

def idpe_only(row):
if row['idutilisateur-peconnect']==None:
if row['idutilisateur-peconnect'] is None:
return 0
else : return 1
return 1


activity_df['tri_idpec'] = activity_df.apply(
lambda row: idpe_only(row), axis=1)
activity_df = activity_df[activity_df.tri_idpec != 0]
activity_idpec = activity_df.drop_duplicates(
subset=['idutilisateur-peconnect'], keep='first')

table_activity['provisoire']=table_activity.apply(lambda row:idpe_only(row),axis=1)
table_activity=table_activity[table_activity.provisoire != 0]
table_activity_2_bis= table_activity.drop_duplicates(subset=['idutilisateur-peconnect'], keep='first')
activity_idpec = activity_idpec[[
'dateheure', 'idutilisateur-peconnect']]
activity_idpec.to_sql(
con=engine, name='idpe_connect', if_exists='append', index=False)

table_activity_2_bis=table_activity_2_bis[['dateheure','idutilisateur-peconnect']]
table_activity_2_bis.to_sql(con=engine, name='idpe_connect', if_exists='append',index=False)
cliks_of_interest = ['details', 'afficher-details',
'telecharger-pdf', 'ajout-favori']

cliks_of_interest=['details','afficher-details','telecharger-pdf','ajout-favori']

def tri_nom(row):
def tri_names(row):
if row['nom'] in cliks_of_interest:
return True
else :
return False
return False


activity_df['tri_names'] = activity_df.apply(
lambda row: tri_names(row), axis=1)
activity_logs = activity_df[activity_df.tri_names is True]

table_activity['to_tej'] = table_activity.apply(lambda row: tri_nom(row), axis=1)
table_activity_2 = table_activity[table_activity.to_tej == True]

def siret(row):
return row['proprietes']['siret']

table_activity_2['siret'] = table_activity_2.apply(lambda row: siret(row), axis=1)
cols_of_interest=["dateheure","nom","idutilisateur-peconnect","siret"]
table_activity_3=table_activity_2[cols_of_interest]
table_activity_3.to_sql(con=engine, name='activity_logs', if_exists='append',index=False)

activity_logs['siret'] = activity_logs.apply(
lambda row: siret(row), axis=1)
cols_of_interest = ["dateheure", "nom", "idutilisateur-peconnect", "siret"]
act_logs_good = activity_logs[cols_of_interest]
act_logs_good.to_sql(con=engine, name='activity_logs',
if_exists='append', index=False)
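
To make the contributor's suggestion concrete, here is a minimal sketch of the usual standard-library / third-party / local import grouping, using only modules that already appear in this diff (illustrative only, not part of the commit):

# Standard library
import json
import os
import urllib.parse

# Third-party libraries (blank line before, as the comment suggests)
import pandas as pd
from sqlalchemy import create_engine

# Project-local code
from labonneboite.importer import util as import_util
from labonneboite.importer import settings as importer_settings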