
Commit

updating covid policy app
open-risk committed Sep 16, 2024
1 parent f132257 commit d05bc34
Showing 9 changed files with 1,067 additions and 193 deletions.
718 changes: 718 additions & 0 deletions policy/management/commands/covid_codebook.md

Large diffs are not rendered by default.

@@ -35,7 +35,7 @@


class Command(BaseCommand):
help = 'Create policy data dataflow directories'
help = 'Create covid policy data dataflow directories'
Debug = False
Logging = True

@@ -55,8 +55,8 @@

class Command(BaseCommand):
help = 'Create policy dimension dictionary'
Debug = False
Logging = True
Debug = True
Logging = False

start_time = time.time()
start_timestamp = datetime.isoformat(datetime.now())
@@ -76,6 +76,7 @@ class Command(BaseCommand):
dataflow_dict = pickle.load(open(datapath + '/dataflow_dict' + '.pkl', 'rb'))

actual_codes = {}
print(len(field_codes), len(field_description))
for field in field_codes:
f_index = field_codes.index(field)
actual_codes[field] = field_description[f_index]
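
The loop above pairs each field code with its description by position via list.index. A minimal equivalent sketch using zip, assuming the two lists are parallel and of equal length (which the added length check suggests):

# Sketch: build the code -> description mapping by position.
# Assumes field_codes and field_description are parallel lists of equal length.
actual_codes = dict(zip(field_codes, field_description))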
@@ -1,9 +1,14 @@
#!/bin/bash
wget 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'

# Download Raw data from OxCGRT github repo (final datasets)
# https://github.com/OxCGRT/covid-policy-dataset/blob/main/data/OxCGRT_compact_national_v1.csv
# https://github.com/OxCGRT/covid-policy-dataset/blob/main/data/OxCGRT_compact_subnational_v1.csv

DAY=$(date -d "today" +"%d-%m-%Y")
FILE="Oxford_Policies_Report_"$DAY".csv"
cp OxCGRT_latest.csv ../../policy_data/Oxford_Policies_Report_Latest.csv
mv OxCGRT_latest.csv ../../policy_data/$FILE

echo "================================================================================" > ../../policy_data/Logs/processing.log
echo "Downloading Oxford dataset as of "$DAY >> ../../policy_data/Logs/processing.log
echo "================================================================================" >> ../../policy_data/Logs/processing.log
@@ -33,8 +33,10 @@
"AF": "Afghanistan",
Updated at 2/19/21 to productionize
Updated at 9/16/24 to final dataset
"""


import json
import pickle
import time
@@ -53,9 +53,9 @@


class Command(BaseCommand):
help = 'Extract policy dataseries metadata from csv'
Debug = False
Logging = True
help = 'Extract covid policy dataseries metadata from csv'
Debug = True
Logging = False
dataflowpath = settings.DATA_PATH + 'dataflows/'

start_time = time.time()
@@ -72,7 +72,7 @@ class Command(BaseCommand):
count = 0
filepath = settings.CSV_FILE_PATH
mydata = pd.read_csv(filepath)
mydata = mydata.fillna(0)
mydata = mydata.fillna(0.0)
total_rows = mydata.shape[0]

if Logging:
@@ -107,7 +107,7 @@ class Command(BaseCommand):
# Construct collection of identifiers for this dataflow
values = []
identifier = []
# ATTN WE FILL NAN WITH ZERO
# ATTN WE FILL NAN's WITH ZEROS
for field in field_names:
values.append(row[field])
f_index = field_names.index(field)
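
The extraction command reads the raw CSV with pandas, fills missing values with 0.0, and then builds a per-row identifier from a list of field names. A standalone sketch of that pattern; the file path and field names are hypothetical placeholders:

import pandas as pd

# Hypothetical inputs standing in for settings.CSV_FILE_PATH and the parsed field list.
filepath = 'policy_data/Oxford_Policies_Report_Latest.csv'
field_names = ['CountryCode', 'Jurisdiction', 'Date']

mydata = pd.read_csv(filepath)
mydata = mydata.fillna(0.0)   # ATTN: NaN values are replaced with zeros
total_rows = mydata.shape[0]

for _, row in mydata.iterrows():
    # Collect the identifier values for this row's dataflow
    values = [row[field] for field in field_names]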
@@ -69,9 +69,9 @@


class Command(BaseCommand):
help = 'Process policy dataseries'
Debug = True
Logging = True
help = 'Process covid policy dataseries'
Debug = False
Logging = False

datapath = settings.DATA_PATH

@@ -85,7 +85,7 @@ class Command(BaseCommand):
logfile.write('> Starting at: ' + str(date) + '\n')

if Debug:
print('> Script 3: Process Oxford Policy Data')
print('> Process Oxford Covid Policy Data')
print('> ' + str(date) + '\n')

# Read the list of downloaded dataseries
@@ -152,76 +152,82 @@ class Command(BaseCommand):
Data['Long Description'] = field_description_long[field_index]
Data['Code List'] = field_code_list[field_id]
Data['Field Type'] = field_type[field_index]
# print(field_index, Data['Description'], Data['Long Description'])

# Load and parse the saved JSON files

input_file = str(datapath) + '/policy/policy_data/dataflows/' + dataflow + '/' + series_id + '.json'
input_file = str(datapath) + '/dataflows/' + dataflow + '/' + series_id + '.json'

Dates = []
Values = []

# Try to load the data
try:
mydata = json.load(open(input_file))
Dates = mydata['Dates']
Values = mydata['Values']
if Debug:
print('Parsed ' + series_id + '\n')
except Exception as e:
logfile.write('Could not load/parse ' + series_id + '\n')
continue
if Logging:
logfile.write('Could not load/parse ' + series_id + '\n')
else:
print('Could not load/parse ' + series_id + '\n')

# Total number of observations
ObsCount = len(Values)
# Actual measurements (excluding NaN)
ValidCount = np.count_nonzero(~np.isnan(Values))

if Debug:
print(ObsCount, ValidCount)
# Actual measurements (excluding NaN)
# ValidCount = np.count_nonzero(~np.isnan(Values))

LastDate = Dates[len(Dates) - 1]
lastDate = datetime.strptime(LastDate, "%Y-%m-%d").date()

now = datetime.now()
nowDate = now.date()
# now = datetime.now()
# nowDate = now.date()

#
# OVERRIDE For now we keep only full observation sets
#
if Debug:
print(ObsCount, lastDate)

# if ValidCount == ObsCount:
# if Logging:
# check1 = " len > 6"
# logfile.write(dataflow + ' : ' + series_id + check1 + '\n')

if ObsCount > 6: # Filter out insufficiently long timeseries
# Compute absolute and percent differences for numerical types
# print(Data['Field Type'], Values[-10:])

Diffs = []
PDiffs = []
Diffs.append(0)
PDiffs.append(0)

for k in range(1, len(Values)):
if Data['Field Type'] != 'text':

if Logging:
check1 = " len > 6"
logfile.write(dataflow + ' : ' + series_id + check1 + '\n')

# Compute absolute and percent differences
Diffs = []
PDiffs = []
Diffs.append(0)
PDiffs.append(0)
for k in range(1, len(Values)):
Diffs.append(Values[k] - Values[k - 1])
if math.fabs(Values[k - 1]) > 0:
PDiffs.append((Values[k] - Values[k - 1]) / Values[k - 1])
else:
PDiffs.append(0)
else:
Diffs.append(0)
PDiffs.append(0)

series['Status'] = 'Valid'
Data['Dates'] = Dates
Data['Values'] = Values
Data['Delta'] = Diffs
Data['PDelta'] = PDiffs
series['Status'] = 'Valid'
Data['Dates'] = Dates
Data['Values'] = Values
Data['Delta'] = Diffs
Data['PDelta'] = PDiffs

# Compute Metrics
Metrics = {}
Value = np.array(Values)
FirstDate = Dates[0]
# ObsCount = len(Value)
# Compute Metrics
Metrics = {}
Value = np.array(Values)
FirstDate = Dates[0]
# ObsCount = len(Value)

# TODO CONVERT TO UTC DATE
Data['LastDate'] = LastDate
# TODO CONVERT TO UTC DATE
Data['LastDate'] = LastDate

if Data['Field Type'] != 'text':
# Compute key risk metrics and recent observations
max_value = np.max(Value)
min_value = np.min(Value)
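
For numeric series the command computes absolute and percentage day-over-day differences before attaching them to the output record. A standalone sketch of that differencing step, assuming Values is a plain list of floats:

import math

# Hypothetical sample series standing in for the parsed 'Values' list.
Values = [1.0, 2.0, 2.0, 0.0, 3.0]

Diffs = [0]    # absolute change, first entry padded with 0
PDiffs = [0]   # percentage change, with a zero-denominator guard
for k in range(1, len(Values)):
    Diffs.append(Values[k] - Values[k - 1])
    if math.fabs(Values[k - 1]) > 0:
        PDiffs.append((Values[k] - Values[k - 1]) / Values[k - 1])
    else:
        PDiffs.append(0)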
@@ -267,92 +273,88 @@
'T-3': round(value_m3, 3),
'Orders': val_orders
}
else:
Metrics = {
'FirstDate': FirstDate,
'LastDate': LastDate,
'Frequency': Data['Frequency'],
'ObsCount': ObsCount,
'Min': 0,
'Max': 0,
'Median': 0,
'Q25': 0,
'Q75': 0,
'Mean': 0,
'Vol': 0,
'Skew': 0,
'Kurtosis': 0,
'T': 0,
'T-1': 0,
'T-2': 0,
'T-3': 0,
'Orders': 0
}

Data['Metrics'] = Metrics

if std_value > 0: # Ignore flat timeseries
check2 = " vol > 0"
if Logging:
logfile.write(dataflow + ' : ' + series_id + check2 + '\n')

# Compute Geometry (For Volatility Gauge Visualization)

# VOLATILITY GAUGE SETTINGS
# Map key values onto circle
# Average is at 90
# 1 std is 30
# angle_unit = 3.1415 / 6.0
# theta_current = angle_unit * (last_value - mean_value) / std_value
# theta_m1 = angle_unit * (value_m1 - mean_value) / std_value
# theta_m2 = angle_unit * (value_m2 - mean_value) / std_value
# theta_m3 = angle_unit * (value_m3 - mean_value) / std_value
# theta_max = angle_unit * (max_value - mean_value) / std_value
# theta_min = angle_unit * (min_value - mean_value) / std_value

# Zero decline / increase is at 90
# 20% change is 30 degrees
mean_value = 0
std_value = 20
angle_unit = 3.1415 / 6.0
theta_current = angle_unit * (last_value - mean_value) / std_value
theta_m1 = angle_unit * (value_m1 - mean_value) / std_value
theta_m2 = angle_unit * (value_m2 - mean_value) / std_value
theta_m3 = angle_unit * (value_m3 - mean_value) / std_value
theta_max = angle_unit * (max_value - mean_value) / std_value
theta_min = angle_unit * (min_value - mean_value) / std_value

Geometry = {'Max': theta_max, 'Min': theta_min, 'Current': theta_current, 'Min1': theta_m1,
'Min2': theta_m2, 'Min3': theta_m3}
Data['Geometry_1D'] = Geometry
Data['Metrics'] = Metrics

if Data['Field Type'] != 'text':

# Compute Geometry (For Volatility Gauge Visualization)

# VOLATILITY GAUGE SETTINGS
# Map key values onto circle
# Average is at 90
# 1 std is 30
# angle_unit = 3.1415 / 6.0
# theta_current = angle_unit * (last_value - mean_value) / std_value
# theta_m1 = angle_unit * (value_m1 - mean_value) / std_value
# theta_m2 = angle_unit * (value_m2 - mean_value) / std_value
# theta_m3 = angle_unit * (value_m3 - mean_value) / std_value
# theta_max = angle_unit * (max_value - mean_value) / std_value
# theta_min = angle_unit * (min_value - mean_value) / std_value

# Zero decline / increase is at 90
# 20% change is 30 degrees
mean_value = 0
std_value = 20
angle_unit = 3.1415 / 6.0
theta_current = angle_unit * (last_value - mean_value) / std_value
theta_m1 = angle_unit * (value_m1 - mean_value) / std_value
theta_m2 = angle_unit * (value_m2 - mean_value) / std_value
theta_m3 = angle_unit * (value_m3 - mean_value) / std_value
theta_max = angle_unit * (max_value - mean_value) / std_value
theta_min = angle_unit * (min_value - mean_value) / std_value

Geometry = {'Max': theta_max, 'Min': theta_min, 'Current': theta_current, 'Min1': theta_m1,
'Min2': theta_m2, 'Min3': theta_m3}

else:
check2 = " vol = 0"
if Logging:
logfile.write(dataflow + ' : ' + series_id + check2 + '\n')
series['Status'] = 'Stale Dataset'
else:
check1 = " len < 6"
if Logging:
logfile.write(dataflow + ' : ' + series_id + check1 + '\n')
series['Status'] = 'Insufficient Length Dataset'
# else:
# check4 = " Missing Data"
# if Logging:
# logfile.write(dataflow + ' : ' + series_id + check4 + '\n')
# series['Status'] = 'Missing Data'
Geometry = {'Max': 0, 'Min': 0, 'Current': 0, 'Min1': 0, 'Min2': 0, 'Min3': 0}

Data['Geometry_1D'] = Geometry

processing_end = time.time()
processing_time = round(processing_end - processing_start, 4)

if Debug:
print(count, len(series_list), processing_time)

# if Debug:
# print(Data)
# if series['Status'] is 'Valid':
# print(dataflow, " : ", series_id, LastDate, processing_time, " sec")
# else:
# print(dataflow, " : ", series_id, series['Status'], check1, check2, check3, check4)
#
#
output_file = str(datapath) + '/policy/policy_data/dataflows/' + dataflow + '/' + series_id + '.P' + '.json'
output_file = str(datapath) + '/dataflows/' + dataflow + '/' + series_id + '.P' + '.json'
print('Dumping File: ', output_file)
print(Data)
json.dump(Data, open(output_file, 'w'), sort_keys=True, indent=4, separators=(',', ': '))
series_list_update.append(series)
print('Done')

# store an updated list of dataseries dicts
json.dump(series_list_update, open(dataseries_list_update_file, 'w'), sort_keys=True, indent=4,
separators=(',', ': '))

# Attach series we have not processed
# series_list_update.extend(static_list)
# if Debug:
# print("Processed :", len(download_list))
# print("Untouched :", len(static_list))

if Logging:
logfile.write("> Processed Policy Data \n")
logfile.write("> Execution Time: %s seconds --- \n" % (time.time() - start_time))
logfile.write(80 * '=' + '\n')
logfile.close()

def handle(self, *args, **options):
self.stdout.write(self.style.SUCCESS('Successfully processed policy data'))
self.stdout.write(self.style.SUCCESS('Successfully processed covid policy data'))
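
For reference, the gauge geometry used above maps recent observations onto dial angles, with zero change at the top of the gauge and a 20% change corresponding to 30 degrees (pi/6 radians). A standalone sketch with hypothetical sample values:

# Gauge geometry sketch: mean_value = 0 and std_value = 20 reproduce the
# '20% change is 30 degrees' convention from the command above.
angle_unit = 3.1415 / 6.0
mean_value = 0
std_value = 20

# Hypothetical recent observations (percentage changes).
last_value, value_m1, value_m2, value_m3 = 5.0, -10.0, 0.0, 25.0
max_value, min_value = 40.0, -30.0

Geometry = {
    'Current': angle_unit * (last_value - mean_value) / std_value,
    'Min1': angle_unit * (value_m1 - mean_value) / std_value,
    'Min2': angle_unit * (value_m2 - mean_value) / std_value,
    'Min3': angle_unit * (value_m3 - mean_value) / std_value,
    'Max': angle_unit * (max_value - mean_value) / std_value,
    'Min': angle_unit * (min_value - mean_value) / std_value,
}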