From 39ab51bb485845dc9d0f40ff0cac4fc63ea9b063 Mon Sep 17 00:00:00 2001 From: Paul Girard Date: Thu, 19 Feb 2015 18:11:41 +0100 Subject: [PATCH] added Total_type World total trade special case management related to #1 --- database_creation/mdb_to_sqlite.py | 50 ++++++++++++++++-------- database_creation/test/tests.py | 4 ++ database_creation/test/total_type.py | 58 ++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 16 deletions(-) create mode 100644 database_creation/test/total_type.py diff --git a/database_creation/mdb_to_sqlite.py b/database_creation/mdb_to_sqlite.py index 193e1279..55719969 100644 --- a/database_creation/mdb_to_sqlite.py +++ b/database_creation/mdb_to_sqlite.py @@ -5,6 +5,7 @@ import json + try : conf=json.load(open("config.json","r")) except : @@ -34,10 +35,14 @@ # print new_table_name #c.executescript() sql=subprocess.check_output("mdb-export -I sqlite %s '%s'"%(mdb_filename,table),shell=True) - print "%s: got %s lines of sql"%(table,len(sql.split("\n"))) + print "%s: got %s lines of sql"%(table,len(sql.split(";\n"))) #c.execute("BEGIN TRANSACTION") - for insert in sql.split("\n"): - c.execute(insert) + for insert in sql.split(";\n"): + try : + c.execute(insert) + except Exception as e: + print "'%s'"%insert + raise e #c.execute("END") print "inserts done" @@ -67,6 +72,12 @@ # clean Land/Sea c.execute("UPDATE `flow` SET `Land/Sea` = null WHERE `Land/Sea` = ' '") +#clean total type +c.execute("UPDATE `flow` SET `Total_type` = lower(`Total_type`) WHERE `Total_type` is not null") + + + + # RICENTITIES # add a slug as RICentities id @@ -76,9 +87,7 @@ c.execute("""UPDATE RICentities SET id=REPLACE(id,"(","") WHERE 1""") c.execute("""UPDATE RICentities SET id=REPLACE(id,")","") WHERE 1""") c.execute("""UPDATE RICentities SET id=REPLACE(id,"***","") WHERE 1""") -#create indeces -c.execute("""CREATE UNIQUE INDEX i_re_id ON RICentities (id)""") -c.execute("""CREATE INDEX i_re_rn ON RICentities (RICname)""") + # remove 770 'Pas de données' : a priori on tente de les garder @@ -118,18 +127,21 @@ LEFT OUTER JOIN entity_names_cleaning as p ON trim(`Partner Entity_Original Name`)=p.original_name COLLATE NOCASE LEFT OUTER JOIN RICentities p2 ON p2.RICname=p.RICname WHERE - `Partner Entity_Sum` is null - and ((`Total Trade Estimation` is null and partner != "World" )or(`Total Trade Estimation`=1 and partner = "World")) + `Partner Entity_Sum` is null and partner is not null and expimp != "Re-exp" """) -# INDEX -c.execute("""CREATE INDEX i_rid ON flow_joined (reporting_id)""") -c.execute("""CREATE INDEX i_pid ON flow_joined (partner_id)""") -c.execute("""CREATE INDEX i_yr ON flow_joined (Yr)""") -c.execute("""CREATE INDEX i_r ON flow_joined (reporting)""") -c.execute("""CREATE INDEX i_p ON flow_joined (partner)""") +# taking care of Total_type flag to define the world partner +# and ((`Total Trade Estimation` is null and partner != "World" )or(`Total Trade Estimation`=1 and partner = "World")) +c.execute("""INSERT INTO RICentities (`id`,`RICname`,`type`,`continent`) VALUES ("Worldestimated","World_estimated","geographical_area","World")""") +c.execute("""UPDATE flow_joined SET partner="World_estimated", partner_id="Worldestimated" WHERE partner="World" and Total_type="total_estimated" """) +c.execute("""INSERT INTO RICentities (`id`,`RICname`,`type`,`continent`) VALUES ("Worldasreported","World_as_reported","geographical_area","World")""") +c.execute("""UPDATE flow_joined SET partner="World_as_reported", partner_id="Worldasreported" WHERE partner="World" and Total_type="total_reporting1" """) +c.execute("""INSERT INTO RICentities (`id`,`RICname`,`type`,`continent`) VALUES ("Worldasreported2","World_as_reported2","geographical_area","World")""") +c.execute("""UPDATE flow_joined SET partner="World_as_reported2", partner_id="Worldasreported2" WHERE partner="World" and Total_type="total_reporting2" """) +c.execute("""INSERT INTO RICentities (`id`,`RICname`,`type`,`continent`) VALUES ("Worldundefined","World_undefined","geographical_area","World")""") +c.execute("""UPDATE flow_joined SET partner="World_undefined", partner_id="Worldundefined" WHERE partner="World" and Total_type is null """) @@ -270,8 +282,14 @@ print ("removing %s Gen duplicates for %s"%(r,len(ids))).encode("utf8") c.execute("DELETE FROM flow_joined WHERE id IN (%s)"%",".join(ids)) - - +# INDEX +c.execute("""CREATE INDEX i_rid ON flow_joined (reporting_id)""") +c.execute("""CREATE INDEX i_pid ON flow_joined (partner_id)""") +c.execute("""CREATE INDEX i_yr ON flow_joined (Yr)""") +c.execute("""CREATE INDEX i_r ON flow_joined (reporting)""") +c.execute("""CREATE INDEX i_p ON flow_joined (partner)""") +c.execute("""CREATE UNIQUE INDEX i_re_id ON RICentities (id)""") +c.execute("""CREATE INDEX i_re_rn ON RICentities (RICname)""") print "cleaning done" conn.commit() diff --git a/database_creation/test/tests.py b/database_creation/test/tests.py index 418376d3..7ba53176 100644 --- a/database_creation/test/tests.py +++ b/database_creation/test/tests.py @@ -31,6 +31,10 @@ flow_status="OK" if flow.test(cursor) else "FAILED" print "FLOW TEST : %s"%flow_status +import total_type +total_type_status="OK" if total_type.test(cursor) else "FAILED" +print "TOTAL_TYPE TEST : %s"%total_type_status + # c.execute("""SELECT `Reporting Entity_Original Name` # FROM flow diff --git a/database_creation/test/total_type.py b/database_creation/test/total_type.py new file mode 100644 index 00000000..11fd8817 --- /dev/null +++ b/database_creation/test/total_type.py @@ -0,0 +1,58 @@ +import codecs +import os + +import json +import sqlite3 + + +def test(cursor): + cursor.execute("""SELECT count(*) + FROM flow_joined + WHERE partner="World_undefined" + """) + print "%s undefined type of Total Trade to World"%cursor.fetchone() + + cursor.execute("""SELECT count(*),sum(nb) FROM (SELECT count(*) as nb + FROM flow_joined + group by reporting,Yr,expimp) + """) + nb_reporting_annual_flows, total_flows=cursor.fetchone() + print "%s number of reporting exp or imp total annual flows on %s total"%(nb_reporting_annual_flows,total_flows) + + cursor.execute("""SELECT world_type_group, sum(nb) as sum FROM (SELECT count(*) as nb,group_concat(partner,"|") as world_type_group + FROM flow_joined + WHERE partner IN ("World_undefined","World_estimated","World_as_reported","World_as_reported2") + group by reporting,Yr,expimp ) + group by world_type_group + ORDER BY sum DESC + """) + print "\nrepartition of type of Total Trade to World as duplicates:" + missing_world_flows_worldview=0 + missing_world_flows_countryview=0 + undefined=0 + + for world_type_group,nb in cursor: + print world_type_group,nb + if "World_estimated" in world_type_group or "World_as_reported" in world_type_group: + missing_world_flows_worldview+=nb + if "World_as_reported2" in world_type_group or "World_as_reported" in world_type_group: + missing_world_flows_countryview+=nb + if world_type_group == "World_undefined": + undefined+=nb + print "\n%s %.1f%% flows compatible with world view"%(missing_world_flows_worldview,100*float(missing_world_flows_worldview)/nb_reporting_annual_flows) + print "%s %.1f%% flows compatible with country view"%(missing_world_flows_countryview,100*float(missing_world_flows_countryview)/nb_reporting_annual_flows) + print "%s Total trade to World flows with no type"%undefined + + print "\nisolating World_undefined-only flows :" + + cursor.execute("""SELECT reporting,count(*),group_concat(Yr,"|") + FROM (SELECT reporting,Yr,group_concat(partner) as partners_group + FROM flow_joined + WHERE partner IN ("World_undefined","World_estimated","World_as_reported","World_as_reported2") + group by reporting,Yr,expimp ) + WHERE partners_group="World_undefined" + group by reporting + """) + for f in cursor: + print "Reporting: %s, %s cases, years: %s"%f + return True