From 77783154e85673626d5e88aa14d1c0fab9a79d72 Mon Sep 17 00:00:00 2001 From: Ivo Branco Date: Wed, 16 Oct 2024 17:10:09 +0100 Subject: [PATCH] refactor: fix XtraDB, no longer uses intermediate db Fix execution on Percona XtraDB. Remove the need for a temporary database. Use temporary tables. Prevent deadlocks on XtraDB by executing inside a read-only transaction. Change execution command. fccn/nau-technical#293 --- .gitignore | 2 +- README.md | 27 +++---- config.ini.sample | 16 +--- export.py | 36 +++++++++ nau.py | 188 +++++++++++++++++----------------------------- report_google.py | 10 --- report_xlsx.py | 17 +---- update_data.py | 20 ----- 8 files changed, 124 insertions(+), 192 deletions(-) create mode 100644 export.py delete mode 100644 update_data.py diff --git a/.gitignore b/.gitignore index 5777e04..1d989ce 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ __pycache__/ venv/ *.xlsx -config.ini +config.ini* diff --git a/README.md b/README.md index 0876d94..c963ec5 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,13 @@ update a Google Sheet file. On the NAU project it is used the second option. The NAU dashboard, based on Google Data Studio, use that Google Sheet has one of its data source. -This project requires an intermediate database on the same engine of the `edxapp` -openedx database. The mysql database user needs a read grant for the `edxapp` and -all grants for its own database. It produces precalculated tables/materialized views -that are 1 to 1 with the xlsx file sheets or each sheet of the google spreadsheet -file, each relevant table is prefixed with the `DATA_` string. +The MySQL database user needs a read grant for the `edxapp` database. Those scripts should be run at least once a day, preference after the midnight, so your -Google Sheet file always contain yesterday's data in full. +Google Sheet file always contains yesterday's data in full. 
The queries don't have any reference to individual users, and don't have specific -identification numbers, like user id, emails or similar data. +identification numbers, like user id, emails or similar data; so it's GDPR compliant. # Usage @@ -24,7 +20,9 @@ identification numbers, like user id, emails or similar data. - Set the `config.ini` file based on the `config.init.sample`. - Execute `report_xlsx.py` or `report_google.py`. + ### Activate virtual environment and install its dependencies + ```bash virtualenv venv --python=python3 source venv/bin/activate @@ -37,19 +35,12 @@ cp config.init.sample config.ini vim config.ini ``` -### Update precalculated data -To update the precalculated data run: - -```bash -python update_data.py -``` - -### Export has xlsx file +### Export data to a xlsx file ```bash -python report_xlsx.py +python export.py --config config.ini --export xlsx ``` -### Update a Google Sheet +### Export data to a Google Sheet ```bash -python report_google.py +python export.py --config config.ini --export google_sheets ``` diff --git a/config.ini.sample b/config.ini.sample index b6b25ec..10c62a7 100644 --- a/config.ini.sample +++ b/config.ini.sample @@ -7,7 +7,7 @@ password = password database = edxapp [sheets] -# progress = True +progress = True [google_service_account] type = service_account @@ -40,14 +40,6 @@ distinct_users_by_day = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx distinct_users_by_month = xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx [xlsx] -# File name -file = nau_reports.xlsx -default_date_format = yyyy-mm-dd -export = organizations,course_runs,course_run_by_date,enrollments_with_profile_info,users,distinct_users_by_day,distinct_users_by_month,final_summary - -# Configuration when the `update_data.py` is run. 
-[data] -# Configure which data should be synchronized / updated -synchronize = organizations,course_runs,course_run_by_date,enrollments_with_profile_info,enrollments_year_of_birth,enrollments_gender,enrollments_level_of_education,enrollments_country,enrollments_employment_situation,users,registered_users_by_day,distinct_users_by_day,distinct_users_by_month -# number of seconds between each query /update so the database can breathe and we don't too much stress. -seconds_between_updates=120 +; file = nau_reports.xlsx +; default_date_format = yyyy-mm-dd +; export = organizations,course_runs,course_run_by_date,enrollments_with_profile_info,enrollments_year_of_birth,enrollments_gender,enrollments_level_of_education,enrollments_country,enrollments_employment_situation,users,registered_users_by_day,distinct_users_by_day,distinct_users_by_month diff --git a/export.py b/export.py new file mode 100644 index 0000000..6fd9458 --- /dev/null +++ b/export.py @@ -0,0 +1,36 @@ +""" +Script that exports data to xlsx or to a Google Sheet. 
+""" +import argparse +import configparser + +from nau import Reports + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='NAU Open edX Database exporter', + description='Exports to xlsx or to Google Sheet with Open edX DB', + epilog='This program exports to a xlsx file or directly to a Google Sheet information from the Open edX database, so it can be analyze or integrated with dashboard application.', + ) + parser.add_argument('--config', type=argparse.FileType('r'), required=True, help='The path to a config.ini with the required configurations.') + parser.add_argument('--export', required=True, choices=['xlsx','google_sheets'], help='The export mode selected.') + args = parser.parse_args() + + config_file = args.config + config_file_content = config_file.read() + + config = configparser.ConfigParser() + config.read_string(config_file_content) + reports:Reports = Reports(config) + export_mode = args.export + + match export_mode: + case 'xlsx': + from report_xlsx import export_to_xlsx + export_to_xlsx(config, reports) + case 'google_sheets': + from report_google import export_queries_to_google + export_queries_to_google(config, reports) + case _: + raise ValueError(f"Invalid export mode selected {export_mode}") diff --git a/nau.py b/nau.py index 8f1598b..e80344c 100644 --- a/nau.py +++ b/nau.py @@ -1,7 +1,7 @@ from datetime import datetime import mysql.connector import configparser -import time + class DataLink: connection = None @@ -30,13 +30,23 @@ def execute(self, query): def query(self, query): # return a query result set as an list of dicts self._connect() - mycursor = self.connection.cursor() - mycursor.execute(query) - description = mycursor.description - + cursor = self.connection.cursor() + query_with_transaction = f""" +START TRANSACTION READ ONLY; +{query}; +""" + results = cursor.execute(query_with_transaction, multi=True) + + # assuming that only 1 statement returns data + for cur in results: + if cur.with_rows: + cursor = cur + 
break + + description = cursor.description result = [] - - for row in mycursor.fetchall(): + + for row in cursor.fetchall(): r = {} for idx, column in enumerate(description): r[column[0]] = row[idx] @@ -57,11 +67,8 @@ class Reports: data_link = None config : configparser.ConfigParser = None progress : bool - update_data :bool - def __init__(self, update_data: bool, config: configparser.ConfigParser): - self.update_data = update_data - self.seconds_between_updates = int(config.get('data', 'seconds_between_updates')) + def __init__(self, config: configparser.ConfigParser): settings : dict = {} settings["host"] = config.get('connection', 'host', fallback='localhost') settings["port"] = config.get('connection', 'port', fallback='3306') @@ -69,7 +76,6 @@ def __init__(self, update_data: bool, config: configparser.ConfigParser): settings["user"] = config.get('connection', 'user', fallback='read_only') settings["password"] = config.get('connection', 'password') self.edxapp_database = config.get('connection', 'database', fallback='edxapp') - self.output_database = config.get('connection', 'output_database') debug : bool = config.get('connection', 'debug', fallback=False) if debug: @@ -156,34 +162,10 @@ def _apply_data(self, d:dict): title = d.get('title') if self.progress: print("Producing... 
" + title) - self._sleep_if_need() return (title, d.get('data')()) - def _sleep_if_need(self): - if self.update_data and self.seconds_between_updates: - time.sleep(self.seconds_between_updates) - - def _create_tmp_table(self, table, query): - if self.update_data: - self.data_link.execute(f""" - DROP TABLE IF EXISTS {self.output_database}.TMP_{table} - """) - self.data_link.execute(f""" - CREATE TABLE {self.output_database}.TMP_{table} AS - """ + query) - - def _create_and_return_table(self, table, query): - if self.update_data: - self._create_tmp_table(table, query) - self.data_link.execute(f""" - DROP TABLE IF EXISTS {self.output_database}.{table} - """) - self.data_link.execute(f""" - RENAME TABLE {self.output_database}.TMP_{table} TO {self.output_database}.{table} - """) - return self.data_link.query(f""" - SELECT * FROM {self.output_database}.{table} - """) + def _create_and_return_table(self, query): + return self.data_link.query(query) def summary(self): return [dict({ @@ -218,7 +200,7 @@ def final_summary(self): })] def organizations(self): - return self._create_and_return_table('DATA_ORGANIZATIONS', f""" + return self._create_and_return_table(f""" SELECT id, created, modified, name, short_name, description, logo, active FROM {self.edxapp_database}.organizations_organization @@ -228,7 +210,7 @@ def course_runs(self): """ Each line is a course run. 
""" - return self._create_and_return_table('DATA_COURSE_RUNS', f""" + return self._create_and_return_table(f""" SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(id, ':', -1), '+', 1) as org_code, SUBSTRING_INDEX(SUBSTRING_INDEX(id, '+', -2), '+', 1) as course_code, @@ -280,68 +262,7 @@ def course_runs(self): """) def course_run_by_date(self): - def create_index_course_id_date(table): - self.data_link.execute(f"CREATE INDEX course_id_date ON {self.output_database}.TMP_{table} (date)") - - if self.update_data: - self._create_tmp_table('COURSE_RUN_BY_DATE_STUDENT_COURSEENROLLMENT', f""" - SELECT - course_id, - DATE_FORMAT(sce.created, "%Y-%m-%d") date, - count(1) as enrollments_count, - 0 as passed, - 0 as certificates_count, - 0 as block_completion_count - FROM {self.edxapp_database}.student_courseenrollment sce - GROUP BY course_id, date - """) - create_index_course_id_date('COURSE_RUN_BY_DATE_STUDENT_COURSEENROLLMENT') - self._sleep_if_need() - - self._create_tmp_table('COURSE_RUN_BY_DATE_GRADES_PERSISTENTCOURSEGRADE', f""" - SELECT - course_id, - DATE_FORMAT(gpg.passed_timestamp, "%Y-%m-%d") AS date, - 0 as enrollments_count, - count(1) as passed, - 0 as certificates_count, - 0 as block_completion_count - FROM {self.edxapp_database}.grades_persistentcoursegrade gpg - WHERE gpg.passed_timestamp is not null - GROUP BY course_id, date - """) - create_index_course_id_date('COURSE_RUN_BY_DATE_GRADES_PERSISTENTCOURSEGRADE') - self._sleep_if_need() - - self._create_tmp_table('COURSE_RUN_BY_DATE_CERTIFICATES_GENERATEDCERTIFICATE', f""" - SELECT - course_id, - DATE_FORMAT(created_date, "%Y-%m-%d") AS date, - 0 as enrollments_count, - 0 as passed, - count(1) AS certificates_count, - 0 as block_completion_count - FROM {self.edxapp_database}.certificates_generatedcertificate - GROUP BY course_id, date - """) - create_index_course_id_date('COURSE_RUN_BY_DATE_CERTIFICATES_GENERATEDCERTIFICATE') - self._sleep_if_need() - - 
self._create_tmp_table('COURSE_RUN_BY_DATE_COMPLETION_BLOCKCOMPLETION_COUNT', f""" - SELECT - course_key as course_id, - date_format(cbc.created, "%Y-%m-%d") as date, - 0 as enrollments_count, - 0 as passed, - 0 AS certificates_count, - COUNT(1) as block_completion_count - FROM {self.edxapp_database}.completion_blockcompletion cbc - GROUP BY course_key, date - """) - create_index_course_id_date('COURSE_RUN_BY_DATE_COMPLETION_BLOCKCOMPLETION_COUNT') - self._sleep_if_need() - - return self._create_and_return_table('DATA_COURSE_RUN_BY_DATE', f""" + return self._create_and_return_table(f""" SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(course_id, ':', -1), '+', 1) as org_code, SUBSTRING_INDEX(SUBSTRING_INDEX(course_id, '+', -2), '+', 1) as course_code, @@ -372,20 +293,53 @@ def create_index_course_id_date(table): ( SELECT co.social_sharing_url from {self.edxapp_database}.course_overviews_courseoverview co where co.id = t.course_id) as course_marketing_url, ( SELECT co.self_paced from {self.edxapp_database}.course_overviews_courseoverview co where co.id = t.course_id) as self_paced, ( SELECT co.invitation_only from {self.edxapp_database}.course_overviews_courseoverview co where co.id = t.course_id) as invitation_only, - SUM(enrollments_count) as enrollments_count, + SUM(enrollments_count) as enrollments_count, SUM(passed) as passed, SUM(certificates_count) as certificates_count, SUM(block_completion_count) as block_completion_count, (select id from {self.edxapp_database}.course_overviews_courseoverview coc2 where course_code = SUBSTRING_INDEX(SUBSTRING_INDEX(coc2.id, '+', -2), '+', 1) order by created asc limit 1) = course_id as course_run_is_first_edition FROM ( ( - SELECT * FROM {self.output_database}.TMP_COURSE_RUN_BY_DATE_STUDENT_COURSEENROLLMENT + SELECT + course_id, + DATE_FORMAT(sce.created, "%Y-%m-%d") date, + count(1) as enrollments_count, + 0 as passed, + 0 as certificates_count, + 0 as block_completion_count + FROM {self.edxapp_database}.student_courseenrollment sce 
+ GROUP BY course_id, date ) UNION ( - SELECT * FROM {self.output_database}.TMP_COURSE_RUN_BY_DATE_GRADES_PERSISTENTCOURSEGRADE + SELECT + course_id, + DATE_FORMAT(gpg.passed_timestamp, "%Y-%m-%d") AS date, + 0 as enrollments_count, + count(1) as passed, + 0 as certificates_count, + 0 as block_completion_count + FROM {self.edxapp_database}.grades_persistentcoursegrade gpg + WHERE gpg.passed_timestamp is not null + GROUP BY course_id, date ) UNION ( - SELECT * FROM {self.output_database}.TMP_COURSE_RUN_BY_DATE_CERTIFICATES_GENERATEDCERTIFICATE + SELECT + course_id, + DATE_FORMAT(created_date, "%Y-%m-%d") AS date, + 0 as enrollments_count, + 0 as passed, + count(1) AS certificates_count, + 0 as block_completion_count + FROM {self.edxapp_database}.certificates_generatedcertificate + GROUP BY course_id, date ) UNION ( - SELECT * FROM {self.output_database}.TMP_COURSE_RUN_BY_DATE_COMPLETION_BLOCKCOMPLETION_COUNT + SELECT + course_key as course_id, + date_format(cbc.created, "%Y-%m-%d") as date, + 0 as enrollments_count, + 0 as passed, + 0 AS certificates_count, + COUNT(1) as block_completion_count + FROM {self.edxapp_database}.completion_blockcompletion cbc + GROUP BY course_key, date ) ) as t GROUP BY course_id, date @@ -396,7 +350,7 @@ def enrollments_with_profile_info(self): """ Enrollment data with student information """ - return self._create_and_return_table('DATA_ENROLLMENTS_WITH_PROFILE_INFO', f""" + return self._create_and_return_table(f""" SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, ':', -1), '+', 1) as org_code, SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, '+', -2), '+', 1) as course_code, @@ -427,7 +381,7 @@ def enrollments_year_of_birth(self): """ Enrollment data with year of birth """ - return self._create_and_return_table('DATA_ENROLLMENTS_YEAR_OF_BIRTH', f""" + return self._create_and_return_table(f""" SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, ':', -1), '+', 1) as org_code, SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, '+', -2), 
'+', 1) as course_code, @@ -445,7 +399,7 @@ def enrollments_gender(self): """ Enrollment data with year of birth """ - return self._create_and_return_table('DATA_ENROLLMENTS_GENDER', f""" + return self._create_and_return_table(f""" SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, ':', -1), '+', 1) as org_code, SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, '+', -2), '+', 1) as course_code, @@ -463,7 +417,7 @@ def enrollments_level_of_education(self): """ Enrollment data with year of birth """ - return self._create_and_return_table('DATA_ENROLLMENTS_LEVEL_OF_EDUCATION', f""" + return self._create_and_return_table(f""" SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, ':', -1), '+', 1) as org_code, SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, '+', -2), '+', 1) as course_code, @@ -481,7 +435,7 @@ def enrollments_country(self): """ Enrollment data with year of birth """ - return self._create_and_return_table('DATA_ENROLLMENTS_COUNTRY', f""" + return self._create_and_return_table(f""" SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, ':', -1), '+', 1) as org_code, SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, '+', -2), '+', 1) as course_code, @@ -500,7 +454,7 @@ def enrollments_employment_situation(self): """ Enrollment data with employment situation """ - return self._create_and_return_table('DATA_ENROLLMENTS_EMPLOYMENT_SITUATION', f""" + return self._create_and_return_table(f""" SELECT SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, ':', -1), '+', 1) as org_code, SUBSTRING_INDEX(SUBSTRING_INDEX(sce.course_id, '+', -2), '+', 1) as course_code, @@ -515,7 +469,7 @@ def enrollments_employment_situation(self): """) def users(self): - return self._create_and_return_table('DATA_USERS', f""" + return self._create_and_return_table(f""" SELECT date_format(date_joined, "%Y-%m-%d") as register_date, au.is_active, @@ -532,7 +486,7 @@ def users(self): """) def registered_users_by_day(self): - return self._create_and_return_table('DATA_REGISTERED_USERS_BY_DAY', f""" + 
return self._create_and_return_table(f""" SELECT register_date, sum(active) as total_active, sum(total) as total, sum(enrollment_count) as enrollment_count FROM ( @@ -574,7 +528,7 @@ def distinct_users_by_day(self): """ This gives the number of users that have learn by day """ - return self._create_and_return_table('DATA_DISTINCT_USERS_BY_DAY', f""" + return self._create_and_return_table(f""" SELECT DATE_FORMAT(created, "%Y-%m-%d") date, COUNT(distinct user_id) as users FROM {self.edxapp_database}.completion_blockcompletion cbc GROUP BY date @@ -584,7 +538,7 @@ def distinct_users_by_month(self): """ Number of users that have learn on the platform by month """ - return self._create_and_return_table('DATA_DISTINCT_USERS_BY_MONTH', f""" + return self._create_and_return_table(f""" SELECT DATE_FORMAT(created, "%Y-%m") date, COUNT(distinct user_id) as users FROM {self.edxapp_database}.completion_blockcompletion cbc GROUP BY date diff --git a/report_google.py b/report_google.py index ebe24ba..f0c5730 100644 --- a/report_google.py +++ b/report_google.py @@ -71,13 +71,3 @@ def export_queries_to_google(config : configparser.ConfigParser, report:Reports) # Close connection to Google Cloud gc.session.close() - -def main(): - config = configparser.ConfigParser() - config.read('config.ini') - nau_reports = Reports(False, config) - export_queries_to_google(config, nau_reports) - - -if __name__ == "__main__": - main() diff --git a/report_xlsx.py b/report_xlsx.py index 65c43e9..7434666 100644 --- a/report_xlsx.py +++ b/report_xlsx.py @@ -30,8 +30,7 @@ def xlsx_worksheet(data, worksheet): row += 1 -def xlsx_export_queries(config : configparser.ConfigParser, report:Reports): - +def export_to_xlsx(config : configparser.ConfigParser, report:Reports): file_name : str = config.get('xlsx', 'file', fallback='report.xlsx') default_date_format : str = config.get('xlsx', 'default_date_format', fallback='yyyy-mm-dd') workbook = xlsxwriter.Workbook(file_name, {'default_date_format': 
default_date_format}) @@ -41,18 +40,8 @@ def xlsx_export_queries(config : configparser.ConfigParser, report:Reports): for sheet_key in sheets_to_export_keys: sheets_results = report.sheets_data([sheet_key]) for sheet_title, sheet_result in sheets_results: - worksheet = workbook.add_worksheet(sheet_title) + # xlsx supports max of 31 characters on sheet title + worksheet = workbook.add_worksheet(sheet_title[:31]) xlsx_worksheet(sheet_result, worksheet) workbook.close() - - -def main(): - config = configparser.ConfigParser() - config.read('config.ini') - nau_reports = Reports(False, config) - xlsx_export_queries(config, nau_reports) - - -if __name__ == "__main__": - main() diff --git a/update_data.py b/update_data.py deleted file mode 100644 index cff6f17..0000000 --- a/update_data.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -Script that exports to a single xlsx file all the data relevant to be sent to Google Cloud. -""" -import configparser - -from nau import Reports - - -def main(): - config = configparser.ConfigParser() - config.read('config.ini') - report:Reports = Reports(True, config) - to_sync_keys = config.get('data', 'synchronize', fallback=','.join(report.available_sheets_to_export_keys())).split(',') - - for key in to_sync_keys: - report.sheets_data([key]) - - -if __name__ == "__main__": - main()