moest-np · Wahesh · May 31, 2024 · May 31, 2024 · May 31, 2024 · May 31, 2024
diff --git a/2024-05_school_mapping/fuxxy.py b/2024-05_school_mapping/fuxxy.py
@@ -0,0 +1,70 @@
+import pandas as pd
+from fuzzywuzzy import fuzz
+import time
+
+
+# Input and output locations. These are absolute Windows paths; replace them with your own.
+file_a_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\Book3.csv"  
+file_b_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_list_B.tsv"
+output_file_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_matches.csv"  
+
+# Columns that hold the school names to compare (replace with the actual column names).
+school_column_name_a = "velthuis2"  # File A only: per the author's note, the column was split into several chunks in Excel so that only the name part is matched.
+school_column_name_b = "name"  
+
+# Read the inputs: file A is comma-separated, file B is tab-separated.
+try:
+    df_a = pd.read_csv(file_a_path)
+    df_b = pd.read_csv(file_b_path, sep="\t")
+except FileNotFoundError:  # abort with the message on stderr and a non-zero exit status
+    raise SystemExit("Error: One or both files not found. Please check the paths.")
+
+# Lower-case the names for case-insensitive matching, then drop rows left without a usable name (missing cells are NaN, a float the string matcher cannot compare).
+for frame, column in ((df_a, school_column_name_a), (df_b, school_column_name_b)):
+    frame[column] = frame[column].str.lower()
+    frame.dropna(subset=[column], inplace=True)
+
+# Compare every name in file A with every name in file B, keeping the pairs that clear the threshold.
+matches = []
+start_time = time.time()
+
+# Total number of pairwise comparisons; used only for the ETA estimate.
+total_comparisons = len(df_a) * len(df_b)
+processed_comparisons = 0
+report_every = 100_000  # print progress once per this many comparisons (adjust as needed)
+min_score = 100  # minimum partial-ratio score to keep a pair (adjust as needed)
+
+# Materialise file B's (id, name) pairs once instead of re-walking the Series for every row of A.
+b_rows = list(zip(df_b['school_id'], df_b[school_column_name_b]))
+
+for a_id, a_name in zip(df_a['school_id'], df_a[school_column_name_a]):
+    for b_id, b_name in b_rows:
+        processed_comparisons += 1
+
+        # Partial-ratio similarity between the two names; compared against min_score below.
+        score = fuzz.partial_ratio(a_name, b_name)
+        if score >= min_score:
+            matches.append({"school_a_id": a_id, "school_a": a_name, "school_b_id": b_id, "school_b": b_name, "confidence_score": score})
+
+        # Report on a fixed cadence, independent of whether this comparison was a match.
+        if processed_comparisons % report_every == 0:
+            elapsed_time = time.time() - start_time
+            avg_time_per_comparison = elapsed_time / processed_comparisons
+            remaining_comparisons = total_comparisons - processed_comparisons
+            estimated_remaining_time = remaining_comparisons * avg_time_per_comparison
+            remaining_hours = int(estimated_remaining_time // 3600)
+            remaining_minutes = int((estimated_remaining_time % 3600) // 60)
+            remaining_seconds = int(estimated_remaining_time % 60)
+
+            print(f"Processed {processed_comparisons}/{total_comparisons} comparisons")
+            print(f"Estimated remaining time: {remaining_hours} hours, {remaining_minutes} minutes, {remaining_seconds} seconds")
+
+# Persist the matches, or report that there were none.
+if not matches:
+    print("No matches found between the two files.")
+else:
+    # One CSV row per matched pair; the DataFrame's positional index is not written.
+    df_matches = pd.DataFrame(matches)
+    df_matches.to_csv(output_file_path, index=False)
+
+    print(f"Matches with confidence scores written to: {output_file_path}")
diff --git a/2024-05_school_mapping/walrus.py b/2024-05_school_mapping/walrus.py
@@ -0,0 +1,71 @@
+import csv
+from walrus import Database
+import re
+import redis
+
+
+# Raw Redis client, created with default connection settings; used for the flush below and for direct key reads/writes.
+redis_client = redis.StrictRedis()
+
+# WARNING: this wipes every key in the selected database. It is done because leftovers from earlier runs confused the results.
+redis_client.flushdb()
+
+# walrus database handle (also default connection settings), used to create the indexes.
+db = Database()
+
+# Two indexes over the same documents: a plain one and a phonetic one (created with metaphone=True).
+search_index = db.Index('app-search')
+phonetic_index = db.Index('phonetic-search', metaphone=True)
+
+# Load list A and register each school in both indexes (plain and phonetic).
+with open(r"..//school_list_A.tsv", 'r', encoding='utf-8') as list_a:
+    rows_a = csv.DictReader(list_a, delimiter='\t')
+    print("Starting...")
+
+    for entry in rows_a:
+        school_id, velthuis_name = entry['school_id'], entry['velthuis']
+        # The same payload goes into both indexes; the school id travels along
+        # with the content so a search hit can be traced back to its school.
+        payload = {'content': velthuis_name, 'id': school_id}
+
+        # Each index gets its own key suffix for the same school.
+        search_index.add(f"{school_id}_search", **payload)
+        phonetic_index.add(f"{school_id}_phonetic", **payload)
+
+        print(school_id, velthuis_name)
+print("Ending...")
+def sanitize_query(query):
+    """Return *query* with every character that is neither a word character nor whitespace removed."""
+    special_characters = re.compile(r'[^\w\s]')
+    return special_characters.sub('', query)
+
+# Match every school in list B against the two indexes built from list A.
+with open(r"..//school_list_B.tsv", 'r', encoding='utf-8') as file2:
+    reader = csv.DictReader(file2, delimiter='\t')
+    print("Starting to match")
+    for row in reader:
+        school_name = row['name']
+        print("trying to match", school_name)
+
+        # Search with the sanitized name (it was computed but never used before);
+        # presumably punctuation in the raw name can upset the index query parser.
+        sanitized_school_name = sanitize_query(school_name)
+        try:
+            last_doc_id = None
+            # Plain index first, then the phonetic one, as before.
+            for index in (search_index, phonetic_index):
+                for document in index.search(sanitized_school_name):
+                    last_doc_id = document['id']
+                    # The key may not exist, so guard the decode instead of failing on None.
+                    stored = redis_client.get(last_doc_id)
+                    school_name_redis = stored.decode('utf-8') if stored is not None else None
+                    print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {last_doc_id}")
+
+            if last_doc_id is not None:
+                # Record the match under the id of the last hit (behaviour kept from the original).
+                # NOTE(review): earlier hits for the same school are not recorded - confirm this is intended.
+                redis_client.set(f"matched_{last_doc_id}", school_name)
+        except Exception as e:
+            # Best effort: one failing lookup must not abort the remaining schools.
+            print(f"Error matching {school_name}: {e}")
+            print(f"Resuming next school name")