From cedcc9fb53429c1c58484ed9777cdbcbb8b24f42 Mon Sep 17 00:00:00 2001
From: John Davis
Date: Wed, 3 Jul 2024 19:14:21 -0400
Subject: [PATCH 1/2] Add username and email deduplication scripts

---
 lib/galaxy/managers/users.py                  |  7 +-
 lib/galaxy/model/scripts/dedup_emails.py      | 25 +++++
 lib/galaxy/model/scripts/dedup_usernames.py   | 25 +++++
 lib/galaxy/model/scripts/user_deduplicator.py | 98 +++++++++++++++++++
 4 files changed, 154 insertions(+), 1 deletion(-)
 create mode 100644 lib/galaxy/model/scripts/dedup_emails.py
 create mode 100644 lib/galaxy/model/scripts/dedup_usernames.py
 create mode 100644 lib/galaxy/model/scripts/user_deduplicator.py

diff --git a/lib/galaxy/managers/users.py b/lib/galaxy/managers/users.py
index f2d26e538b03..fcbfa6f7c2e8 100644
--- a/lib/galaxy/managers/users.py
+++ b/lib/galaxy/managers/users.py
@@ -908,8 +908,13 @@ def username_exists(session, username: str, model_class=User):
 
 
 def generate_next_available_username(session, username, model_class=User):
+    """Generate unique username; user can change it later"""
+    return generate_next_available_username_with_connection(session.connection(), username, model_class)
+
+
+def generate_next_available_username_with_connection(connection, username, model_class=User):
     """Generate unique username; user can change it later"""
     i = 1
-    while session.execute(select(model_class).where(model_class.username == f"{username}-{i}")).first():
+    while connection.execute(select(model_class).where(model_class.username == f"{username}-{i}")).first():
         i += 1
     return f"{username}-{i}"
diff --git a/lib/galaxy/model/scripts/dedup_emails.py b/lib/galaxy/model/scripts/dedup_emails.py
new file mode 100644
index 000000000000..0de3444652b6
--- /dev/null
+++ b/lib/galaxy/model/scripts/dedup_emails.py
@@ -0,0 +1,25 @@
+import os
+import sys
+
+from sqlalchemy import create_engine
+
+sys.path.insert(
+    1, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, os.pardir, "lib"))
+)
+
+from galaxy.model.orm.scripts import get_config
+from galaxy.model.scripts.user_deduplicator import UserDeduplicator
+
+DESCRIPTION = "Deduplicate user emails in galaxy_user table"
+
+
+def main():
+    """Build an engine from the configured database URL and deduplicate user emails."""
+    config = get_config(sys.argv, use_argparse=False, cwd=os.getcwd())
+    engine = create_engine(config["db_url"])
+    ud = UserDeduplicator(engine=engine)
+    ud.deduplicate_emails()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lib/galaxy/model/scripts/dedup_usernames.py b/lib/galaxy/model/scripts/dedup_usernames.py
new file mode 100644
index 000000000000..8a4aa1765ad6
--- /dev/null
+++ b/lib/galaxy/model/scripts/dedup_usernames.py
@@ -0,0 +1,25 @@
+import os
+import sys
+
+from sqlalchemy import create_engine
+
+sys.path.insert(
+    1, os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, os.pardir, "lib"))
+)
+
+from galaxy.model.orm.scripts import get_config
+from galaxy.model.scripts.user_deduplicator import UserDeduplicator
+
+DESCRIPTION = "Deduplicate usernames in galaxy_user table"
+
+
+def main():
+    """Build an engine from the configured database URL and deduplicate usernames."""
+    config = get_config(sys.argv, use_argparse=False, cwd=os.getcwd())
+    engine = create_engine(config["db_url"])
+    ud = UserDeduplicator(engine=engine)
+    ud.deduplicate_usernames()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lib/galaxy/model/scripts/user_deduplicator.py b/lib/galaxy/model/scripts/user_deduplicator.py
new file mode 100644
index 000000000000..07adab54ad45
--- /dev/null
+++ b/lib/galaxy/model/scripts/user_deduplicator.py
@@ -0,0 +1,98 @@
+from typing import Sequence
+
+from sqlalchemy import (
+    func,
+    select,
+    update,
+)
+from sqlalchemy.engine import (
+    Connection,
+    Row,
+)
+
+from galaxy.managers.users import generate_next_available_username_with_connection
+from galaxy.model import User
+
+
+class UserDeduplicator:
+    """Rename duplicate email and username values in the galaxy_user table."""
+
+    def __init__(self, engine):
+        self.engine = engine
+
+    def deduplicate_emails(self) -> None:
+        """
+        Rename duplicate emails in a single transaction. Within each group of users sharing an
+        email, the most recently created user keeps it; every other user gets a generated value.
+        """
+        with self.engine.begin() as connection:
+            prev_email = None
+            for user_id, email, _ in self._get_duplicate_email_data(connection):
+                if email == prev_email:
+                    new_email = self._generate_replacement_for_duplicate_email(connection, email)
+                    stmt = update(User).where(User.id == user_id).values(email=new_email)
+                    connection.execute(stmt)
+                else:
+                    prev_email = email
+
+    def deduplicate_usernames(self) -> None:
+        """
+        Rename duplicate usernames in a single transaction. Within each group of users sharing a
+        username, the most recently created user keeps it; every other user gets a generated value.
+        """
+        with self.engine.begin() as connection:
+            prev_username = None
+            for user_id, username, _ in self._get_duplicate_username_data(connection):
+                if username == prev_username:
+                    new_username = generate_next_available_username_with_connection(
+                        connection, username, model_class=User
+                    )
+                    stmt = update(User).where(User.id == user_id).values(username=new_username)
+                    connection.execute(stmt)
+                else:
+                    prev_username = username
+
+    def _get_duplicate_username_data(self, connection: Connection) -> Sequence[Row]:
+        """
+        Return (id, username, create_time) rows for all users whose username is shared with
+        another user, ordered by username, then by creation time, newest first.
+        The rows are fetched eagerly on the caller's connection: a Result cannot be read reliably
+        once its connection is closed, and the caller issues updates on it while iterating.
+        """
+        counts = select(User.username, func.count()).group_by(User.username).having(func.count() > 1)
+        sq = select(User.username).select_from(counts.cte())
+        stmt = (
+            select(User.id, User.username, User.create_time)
+            .where(User.username.in_(sq))
+            .order_by(User.username, User.create_time.desc())
+        )
+        return connection.execute(stmt).all()
+
+    def _get_duplicate_email_data(self, connection: Connection) -> Sequence[Row]:
+        """
+        Return (id, email, create_time) rows for all users whose email is shared with
+        another user, ordered by email, then by creation time, newest first.
+        The rows are fetched eagerly on the caller's connection: a Result cannot be read reliably
+        once its connection is closed, and the caller issues updates on it while iterating.
+        """
+        counts = select(User.email, func.count()).group_by(User.email).having(func.count() > 1)
+        sq = select(User.email).select_from(counts.cte())
+        stmt = (
+            select(User.id, User.email, User.create_time)
+            .where(User.email.in_(sq))
+            .order_by(User.email, User.create_time.desc())
+        )
+        return connection.execute(stmt).all()
+
+    def _generate_replacement_for_duplicate_email(self, connection: Connection, email: str) -> str:
+        """
+        Generate a replacement for a duplicate email value. The new value consists of the original email
+        and a suffix. This value cannot be used as an email, but since the original email is part of
+        the new value, it will be possible to retrieve the user record based on this value, if needed.
+        This is used to remove duplicate email values from the database, for the rare case the database
+        contains such values.
+        """
+        i = 1
+        while connection.execute(select(User).where(User.email == f"{email}-{i}")).first():
+            i += 1
+        return f"{email}-{i}"

From 7090b47ba993614f8f73038fc9cc16dfd8b2910a Mon Sep 17 00:00:00 2001
From: John Davis
Date: Wed, 3 Jul 2024 19:27:56 -0400
Subject: [PATCH 2/2] Add script entry points

---
 packages/data/setup.cfg | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/packages/data/setup.cfg b/packages/data/setup.cfg
index 4d8cee887fd9..827fe80396bf 100644
--- a/packages/data/setup.cfg
+++ b/packages/data/setup.cfg
@@ -72,7 +72,9 @@ console_scripts =
         galaxy-load-objects = galaxy.model.store.load_objects:main
         galaxy-manage-db = galaxy.model.orm.scripts:manage_db
         galaxy-prune-histories = galaxy.model.scripts:prune_history_table
+        galaxy-dedup-usernames = galaxy.model.scripts.dedup_usernames:main
+        galaxy-dedup-emails = galaxy.model.scripts.dedup_emails:main
 
 [options.packages.find]
 exclude =
-    tests*
\ No newline at end of file
+    tests*