diff --git a/README.rst b/README.rst index d3ecabb..da5469f 100644 --- a/README.rst +++ b/README.rst @@ -2,7 +2,7 @@ pgantomizer =========== Anonymize data in your PostgreSQL dababase with ease. Anonymization is handy if you need to provide data to -people that should not have access to the personal information of the users. +people that should not have access to the personal information of the users. Importing the data to third-party tools where you cannot guarantee what will happen to the data is also a common use case. @@ -10,7 +10,7 @@ Anonymization Process --------------------- The rules for anonynimization are written in a single YAML file. -Columns that should be left in the raw form without anonymization must be explicitly marked in the schema. +Columns that should be left in the raw form without anonymization must be explicitly marked in the schema. This ensures that adding the new column in the DB without thinking about its sensitivity does not leak the data. The default name of the primary key is `id` but a custom one can be specified form the table in the schema. @@ -40,8 +40,10 @@ Finally, the dump file is deleted by default to reduce risk of leakage of unanon Calling pgantomizer from Python ------------------------------- -You can call the functions to dump anonymize the data from Python. +You can call the functions to dump and anonymize the data from Python. Please, look at the `dump_db` and `load_anonymize_remove` in the code. +If you are only after anonymizing an existing database, there is a function `anonymize_db` +that will help you do that. To help integrating the code in complex environments such as a horde of Docker containers, all database-related arguments can be supplied as environmental variables. 
@@ -51,3 +53,4 @@ TODO * expand this README * add automated tests (TravisCI) * submit package automatically to PyPI +* add --dry-run argument that will check the schema and output the operations to be performed diff --git a/pgantomizer/anonymize.py b/pgantomizer/anonymize.py index 7c017d4..1faed78 100644 --- a/pgantomizer/anonymize.py +++ b/pgantomizer/anonymize.py @@ -51,6 +51,9 @@ def drop_schema(db_args): def load_db_to_new_instance(filename, db_args): + if not os.path.isfile(filename): + raise IOError('Dump file {}'.format(filename)) + os.putenv('PGPASSWORD', db_args.get('password')) drop_schema(db_args) subprocess.run( 'PGPASSWORD={password} pg_restore -Fc -j 8 {db_args} {filename} {redirect}'.format( @@ -75,7 +78,7 @@ def prepare_column_for_anonymization(conn, cursor, table, column, data_type): conn.commit() -def check_schema(cursor, schema): +def check_schema(cursor, schema, db_args): for table in schema: try: cursor.execute("SELECT {columns} FROM {table};".format( @@ -84,7 +87,7 @@ def check_schema(cursor, schema): )) except psycopg2.ProgrammingError: logging.warning('Some of the columns specified in the schema do not exist in the dump.') - drop_schema() + drop_schema(db_args) raise @@ -106,7 +109,7 @@ def anonymize_column(cursor, schema, table, column, data_type): def anonymize_db(schema, db_args): with psycopg2.connect(**db_args) as conn: with conn.cursor() as cursor: - check_schema(cursor, schema) + check_schema(cursor, schema, db_args) cursor.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';") for table_name in cursor.fetchall(): cursor.execute("SELECT column_name, data_type FROM information_schema.columns " diff --git a/pgantomizer/dump.py b/pgantomizer/dump.py index 6e4bcdf..d984445 100644 --- a/pgantomizer/dump.py +++ b/pgantomizer/dump.py @@ -9,8 +9,10 @@ def dump_db(dump_path, schema_path, password='', *db_args): schema = yaml.load(open(schema_path)) + password = password or os.environ.get('DB_DEFAULT_PASS', 
'') + os.putenv('PGPASSWORD', password) cmd = 'PGPASSWORD={password} pg_dump -Fc -Z 9 {args} {tables} -f {filename}'.format( - password=password or os.environ.get('DB_DEFAULT_PASS', ''), + password=password, args='-d {} -U {} -h {} -p {} '.format( *(db_args or [os.environ.get(var) for var in ['DB_DEFAULT_NAME', 'DB_DEFAULT_USER', 'DB_DEFAULT_SERVICE', 'DB_DEFAULT_PORT']])),