Skip to content

Commit

Permalink
new spec export clean
Browse files Browse the repository at this point in the history
  • Loading branch information
bturkus committed Jan 17, 2024
1 parent 5213807 commit 1d8b658
Show file tree
Hide file tree
Showing 3 changed files with 248 additions and 0 deletions.
234 changes: 234 additions & 0 deletions ami_scripts/clean_spec_csv_to_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
#!/usr/bin/env python3

import argparse
import os
import json
import pandas as pd
import re
import chardet


def get_args():
    """Build the CLI and return the parsed arguments for the prep script.

    Required: -s/--source (input XLSX/CSV path) and -p/--projectcode.
    Optional: -w/--workorder, -d/--destination, -c/--config (defaults to
    'config.json'), and the -v/--vendor flag that switches output mode.
    """
    parser = argparse.ArgumentParser(
        description='Prep CMS Excel for Import into AMIDB')
    parser.add_argument('-s', '--source', required=True,
                        help='path to the source XLSX')
    parser.add_argument('-w', '--workorder', required=False,
                        help='Work Order ID to apply to new XLSX')
    parser.add_argument('-p', '--projectcode', required=True,
                        help='Project Code to apply to new XLSX')
    parser.add_argument('-d', '--destination', required=False,
                        help='path to the output directory')
    parser.add_argument('-c', '--config', default='config.json', required=False,
                        help='path to the config file')
    parser.add_argument('-v', '--vendor', action='store_true',
                        help='Use vendor mode (skips certain cleanup steps and uses default Excel writer)')
    return parser.parse_args()

def detect_encoding(file_path):
    """Return chardet's best guess at the character encoding of *file_path*.

    Reads the whole file into memory before sniffing, which is fine for
    the CSV sizes this script processes.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    return chardet.detect(raw_bytes)['encoding']

def determine_type_format(row):
    """Derive (type, format, faceNumber) for one row from format_1/2/3.

    Mapping rules:
      - 'video'           -> (format_2, format_3, '')   no face number
      - 'sound recording' -> (format_2, format_3, '1')
      - 'film'            -> (format_1, format_2, '1') if format_2 is an
                             audio film format, else (format_1, format_2, '')
      - anything else     -> (None, None, '')

    Returns a 3-tuple (type, format, faceNumber) of strings (or None for
    unrecognized types).
    """
    audio_film_formats = [
        "16mm film, optical track",
        "16mm film, full-coat magnetic sound",
        "35mm film, optical track",
        "35mm film, full-coat magnetic sound"
    ]

    fmt1 = row['format_1']
    # Guard: pandas represents blank cells as NaN (a float), on which
    # .lower() would raise AttributeError. Treat any non-string value the
    # same as an unrecognized type.
    if not isinstance(fmt1, str):
        return None, None, ''

    format_lower = fmt1.lower()
    if format_lower == 'video':
        return row['format_2'], row['format_3'], ''  # Empty string for video
    elif format_lower == 'sound recording':
        return row['format_2'], row['format_3'], '1'  # 1 for sound recordings
    elif format_lower == 'film':
        # Check format_2 for specific audio film formats
        if row['format_2'] in audio_film_formats:
            return row['format_1'], row['format_2'], '1'
        else:
            return row['format_1'], row['format_2'], ''  # Empty string for other formats
    else:
        return None, None, ''


def map_division_code(vernacular_code):
    """Translate a CMS vernacular division code into its AMIDB division code.

    Unknown codes map to an empty string rather than raising.
    """
    lookup = {
        'SCL': 'scb',
        'DAN': 'myd',
        'RHA': 'myh',
        'MUS': 'mym',
        'TOFT': 'myt',
        'THE': 'myt',
        'MSS': 'mao',
        'GRD': 'grd',
        'NYPLarch': 'axv',
        'MUL': 'mul',
        'BRG': 'mae',
        'JWS': 'maf',
        'LPA': 'myr',
    }
    try:
        return lookup[vernacular_code]
    except KeyError:
        return ''


def map_csv_columns(df):
    """Rename raw CMS export columns to AMIDB field names and derive new fields.

    Steps, in order:
      1. Disambiguate any 'id_barcode*' columns by their neighbor: next to
         'ref_ami_id' it is the item barcode, next to 'name_d_calc' it is
         the archival-box barcode.
      2. Derive source.object.type / source.object.format /
         source.subObject.faceNumber from format_1/2/3 (then drop them).
      3. Rename columns per the mapping, derive divisionCode from the
         vernacular code, drop bookkeeping columns, and stamp constants
         (schemaVersion, fileRole, volumeNumber).

    Mutates *df* in place and also returns it.
    """
    # Standard column mappings
    column_mapping = {
        'ref_ami_id': 'bibliographic.primaryID',
        'id_label_text': 'bibliographic.title',
        'id.classmark': 'bibliographic.classmark',
        'bnumber': 'bibliographic.catalogBNumber',
        'id.legacy': 'bibliographic.formerClassmark',
        'division': 'bibliographic.vernacularDivisionCode',
        'ref_collection_id': 'bibliographic.cmsCollectionID',
        'name_d_calc': 'Archival box number',
        'date': 'bibliographic.date',
        'group': 'bibliographic.group',
        'sequence': 'bibliographic.sequence',
        'notes.content': 'bibliographic.contentNotes',
        'notes.preservation': 'source.notes.physicalConditionPreShipNotes',
        'notes': 'bibliographic.accessNotes',
        'manufacturer': 'source.physicalDescription.stockManufacturer',
        'shrinkage': 'source.physicalDescription.shrinkage.measure',
        'basematerial': 'source.physicalDescription.baseMaterial',
        'acetate_decay_level': 'source.physicalDescription.acetateDecayLevel',
        'colorbw': 'source.contentSpecifications.colorBW',
        'edgecode': 'source.physicalDescription.edgeCode',
        'film_element': 'source.object.filmElement',
        'condition_fading': 'source.physicalDescription.conditionfading',
        'condition_scratches': 'source.physicalDescription.conditionscratches',
        'condition_splices': 'source.physicalDescription.conditionsplices',
        'condition_perforation_damage': 'source.physicalDescription.conditionperforationdamage',
        'condition_distortion': 'source.physicalDescription.conditiondistortion',
        'fps': 'source.contentSpecifications.frameRate.measure',
        'generation': 'source.object.generation',
        'length_ft': 'source.physicalDescription.length.measure',
        'emulsion_position': 'source.physicalDescription.emulsionPosition',
        'aspect_ratio': 'source.contentSpecifications.displayAspectRatio',
        'diameter': 'source.physicalDescription.diameter.measure'
    }

    # Check context for the id_barcode column and map accordingly.
    for col in df.filter(like='id_barcode').columns:
        barcode_index = df.columns.get_loc(col)
        if barcode_index == 0:
            # No preceding column to give context; previously this wrapped
            # to df.columns[-1] and could mis-map the barcode.
            continue
        preceding = df.columns[barcode_index - 1]
        if preceding == 'ref_ami_id':
            # id_barcode next to ref_ami_id refers to the item
            column_mapping[col] = 'bibliographic.barcode'
        elif preceding == 'name_d_calc':
            # id_barcode next to name_d_calc refers to the archival box
            column_mapping[col] = 'Archival box barcode'

    # Derive type, format, and faceNumber, then drop the raw format columns.
    df['source.object.type'], df['source.object.format'], df['source.subObject.faceNumber'] = zip(
        *df.apply(determine_type_format, axis=1))
    df.drop(['format_1', 'format_2', 'format_3'], axis=1, inplace=True)

    # Rename columns based on mapping
    df.rename(columns=column_mapping, inplace=True)

    # Map vernacularDivisionCode to divisionCode
    df['bibliographic.divisionCode'] = df['bibliographic.vernacularDivisionCode'].apply(map_division_code)

    # Drop bookkeeping columns AMIDB does not need. errors='ignore' keeps
    # the script working when an export omits some of them (a plain drop()
    # raises KeyError on any missing label).
    unneeded_columns = ['_account.entered', '_dtentered', 'cat_item_record_id',
                        'ref_acq_id', 'title', 'ux_loc_active_d', 'desc.catY',
                        'cm.trans.type', 'cm.trans.dont', 'cm.de.recY',
                        'cm.de.rationale', 'time', 'condition_average',
                        '_inspected_y', '_inspected_by', '_inspected_dt',
                        '_inspected_time', 'batch.status', 'migration_status']
    df.drop(unneeded_columns, axis=1, inplace=True, errors='ignore')

    # Constants applied to every row of the export.
    df['asset.schemaVersion'] = '2.0.0'
    df['asset.fileRole'] = 'pm'
    df['source.object.volumeNumber'] = 1

    return df


def read_config(config_path):
    """Load and return the JSON configuration stored at *config_path*."""
    with open(config_path, 'r') as handle:
        return json.load(handle)

def replace_characters(df, replacements):
    """Apply each find/replace regex pair to every string cell of *df*.

    Mutates *df* in place. 'find' values are treated as regex patterns;
    non-string cells pass through untouched. Returns None.
    """
    for rule in replacements:
        pattern = rule['find']
        substitute = rule['replace']
        for column in df:
            df[column] = df[column].apply(
                lambda cell: re.sub(pattern, substitute, cell)
                if isinstance(cell, str) else cell)


def apply_format_fixes(df, format_fixes):
    """Force source.object.type for formats listed in the config's format_fixes.

    *format_fixes* maps a target type to the list of format names that
    should receive it. Mutates *df* in place; returns None.
    """
    for target_type, format_names in format_fixes.items():
        matches = df['source.object.format'].isin(format_names)
        df.loc[matches, 'source.object.type'] = target_type


def cleanup_csv(args):
    """Read the source CSV, normalize it for AMIDB, and write *_CLEAN.xlsx.

    Pipeline: detect encoding -> load CSV -> remap/derive columns ->
    apply config-driven character replacements and format fixes ->
    stamp project code / work order -> (vendor mode) build reference
    filenames -> sort columns and rows -> write Excel to the destination.
    """
    if args.source:
        csv_name = os.path.basename(args.source)
        # Output name mirrors the input name with a _CLEAN.xlsx suffix.
        clean_name = os.path.splitext(csv_name)[0] + '_CLEAN.xlsx'

        # Detect file encoding so exports saved in non-UTF-8 encodings still load
        file_encoding = detect_encoding(args.source)

        df = pd.read_csv(args.source, encoding=file_encoding)

        df = map_csv_columns(df)
        config = read_config(args.config)
        replace_characters(df, config['replacements'])
        apply_format_fixes(df, config['format_fixes'])

        # Assign the project code to all rows in the new column
        df['bibliographic.projectCode'] = args.projectcode

        # Sort the DataFrame by 'bibliographic.primaryID'
        df.sort_values(by='bibliographic.primaryID', inplace=True)

        if args.workorder:
            df['WorkOrderId'] = args.workorder

        if args.vendor:
            # Convert to string and format for filename construction
            temp_volume_number = 'v0' + df['source.object.volumeNumber'].astype(str)
            temp_face_number = df['source.subObject.faceNumber'].fillna('')
            # Face numbers are zero-padded to two digits ('f01'); empty
            # faces contribute nothing to the filename.
            temp_face_number = temp_face_number.apply(lambda x: 'f' + str(x).zfill(2) if x else '')

            # Concatenate to create 'Filename (reference)' column,
            # e.g. division_primaryID_v01f01_pm
            df['Filename (reference)'] = df['bibliographic.divisionCode'].astype(str) + '_' + \
                df['bibliographic.primaryID'].astype(str) + '_' + \
                temp_volume_number + \
                temp_face_number + '_' + \
                df['asset.fileRole'].astype(str)

        # Now sort the columns alphabetically
        df = df.reindex(sorted(df.columns), axis=1)
        # Reset the index to get a clean sequential index
        df.reset_index(drop=True, inplace=True)

        # NOTE(review): output is only written when a destination directory
        # is supplied AND exists; otherwise the cleaned frame is silently
        # discarded with no warning — confirm that is intended.
        if args.destination:
            if os.path.exists(args.destination):
                output_file_path = os.path.join(args.destination, clean_name)
                if args.vendor:
                    # Vendor mode: pandas' default writer, no index column.
                    df.to_excel(output_file_path, sheet_name='Sheet1', index=False)
                else:
                    # NOTE(review): this branch omits index=False, so the row
                    # index becomes a column in the sheet, unlike vendor
                    # mode — confirm the asymmetry is intended.
                    writer = pd.ExcelWriter(output_file_path, engine='xlsxwriter')
                    df.to_excel(writer, sheet_name='Sheet1')
                    writer.close()

def main():
    """Script entry point: parse CLI arguments and run the cleanup pipeline."""
    cleanup_csv(get_args())

if __name__ == '__main__':
    main()
    # NOTE(review): redundant — falling off the end of the script already
    # exits with status 0, and exit() is the interactive site-module helper
    # (sys.exit() is the conventional form). Confirm before removing.
    exit(0)


14 changes: 14 additions & 0 deletions ami_scripts/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,20 @@
"U-matic/PCM",
"VHS/PCM",
"Hi8/PCM"
],
"audio optical disc": [
"Audio CD-R",
"Audio CD, pressed",
"Minidisc"
],
"video optical disc": [
"Blu-Ray",
"Laser Disc",
"VCD",
"Video DVD-R",
"Video DVD, pressed",
"Video DVD+R",
"Video DVD+RW"
]
},
"digitizers": {
Expand Down
File renamed without changes.

0 comments on commit 1d8b658

Please sign in to comment.