Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fuzzy and walrus #5

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions 2024-05_school_mapping/fuxxy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import pandas as pd
from fuzzywuzzy import fuzz
import time


# Define file paths (replace with your actual paths)
file_a_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\Book3.csv"
file_b_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_list_B.tsv"
output_file_path = "E:\\incubator-main\\incubator-main\\2024-05_school_mapping\\data\\school_matches.csv"

# Define school name column names (replace with actual names).
# "velthuis2" exists only in file A: the original column was broken down into
# several chunks in Excel so that only the name part takes part in matching.
school_column_name_a = "velthuis2"
school_column_name_b = "name"

# Load both inputs: file A is read as comma-separated, file B as tab-separated.
try:
    df_a = pd.read_csv(file_a_path)
    df_b = pd.read_csv(file_b_path, sep="\t")
except FileNotFoundError as err:
    # Raising SystemExit with a message prints it to stderr and ends the
    # process with exit status 1, so callers and shell scripts can detect the
    # failure. The bare exit() helper is meant for the interactive prompt and
    # would have reported success (status 0).
    raise SystemExit(
        f"Error: One or both files not found. Please check the paths. ({err})"
    )

# Compare names case-insensitively by lower-casing both name columns up front.
df_a[school_column_name_a] = df_a[school_column_name_a].str.lower()
df_b[school_column_name_b] = df_b[school_column_name_b].str.lower()

# Collected matches; each entry is a dict holding both ids, both names and the score.
matches = []
start_time = time.time()

# Minimum partial-ratio score a pair must reach to be kept (adjust as needed).
min_score = 100
# A progress report is printed once every this many comparisons (adjust as needed).
progress_interval = 100

# Build the (id, name) pairs of file B once instead of re-zipping the pandas
# columns on every pass of the outer loop.
b_schools = list(zip(df_b['school_id'], df_b[school_column_name_b]))

# Total number of comparisons, used to estimate the remaining time.
total_comparisons = len(df_a) * len(b_schools)
processed_comparisons = 0

# Compare every school name in file A against every school name in file B.
for a_id, a_name in zip(df_a['school_id'], df_a[school_column_name_a]):
    for b_id, b_name in b_schools:
        processed_comparisons += 1

        # Blank cells are read by pandas as NaN (a float), which the fuzzy
        # scorer cannot handle; treat any non-string name as "no match".
        if isinstance(a_name, str) and isinstance(b_name, str):
            score = fuzz.partial_ratio(a_name, b_name)
        else:
            score = 0

        if score >= min_score:
            matches.append({
                "school_a_id": a_id,
                "school_a": a_name,
                "school_b_id": b_id,
                "school_b": b_name,
                "confidence_score": score,
            })

        if processed_comparisons % progress_interval == 1:
            # Extrapolate the remaining time from the average cost per comparison so far.
            elapsed_time = time.time() - start_time
            avg_time_per_comparison = elapsed_time / processed_comparisons
            remaining_comparisons = total_comparisons - processed_comparisons
            estimated_remaining_time = remaining_comparisons * avg_time_per_comparison
            remaining_minutes, remaining_seconds = divmod(int(estimated_remaining_time), 60)
            remaining_hours, remaining_minutes = divmod(remaining_minutes, 60)

            print(f"Processed {processed_comparisons}/{total_comparisons} comparisons")
            print(f"Estimated remaining time: {remaining_hours} hours, {remaining_minutes} minutes, {remaining_seconds} seconds")

# Persist the collected matches, or report that nothing was found.
if not matches:
    print("No matches found between the two files.")
else:
    # Each match dict becomes one row; the row index is not written to the file.
    df_matches = pd.DataFrame(matches)
    df_matches.to_csv(output_file_path, index=False)
    print(f"Matches with confidence scores written to: {output_file_path}")
71 changes: 71 additions & 0 deletions 2024-05_school_mapping/walrus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import csv
from walrus import Database
import re
import redis


# Plain Redis client, used for flushing and for direct key reads/writes.
redis_client = redis.StrictRedis()

# Flush the entire database to clear all keys.
# Earlier runs left stale entries behind, which confused later searches.
# WARNING: this deletes every key in the currently selected Redis database.
redis_client.flushdb()

# Initialize the walrus database wrapper.
db = Database()

# Two full-text indexes over the same documents: a plain one and one created
# with metaphone=True (named 'phonetic-search').
search_index = db.Index('app-search')
phonetic_index = db.Index('phonetic-search', metaphone=True)

# Read the data from the first TSV file and index it.
# newline='' is what the csv module requires of the files it reads, so that
# newlines inside quoted fields are interpreted correctly.
with open(r"..//school_list_A.tsv", 'r', encoding='utf-8', newline='') as file1:
    reader = csv.DictReader(file1, delimiter='\t')
    print("Starting...")

    for row in reader:
        doc_id = row['school_id']
        content = row['velthuis']

        # The school id is stored inside the document as 'id' so that search
        # results can be traced back to the row in file A.
        document = {'content': content, 'id': doc_id}

        # Each index gets the document under its own, suffixed key.
        search_index.add(f"{doc_id}_search", **document)
        phonetic_index.add(f"{doc_id}_phonetic", **document)
        print(doc_id, content)

    print("Ending...")
def sanitize_query(query):
    """Return *query* stripped of special characters.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is dropped; everything else is kept as is.
    """
    special_characters = re.compile(r'[^\w\s]')
    return special_characters.sub('', query)

# Read the list of schools from the second TSV file and look each one up in
# both indexes.
# newline='' is what the csv module requires of the files it reads.
with open(r"..//school_list_B.tsv", 'r', encoding='utf-8', newline='') as file2:
    reader = csv.DictReader(file2, delimiter='\t')
    print("Starting to match")
    for row in reader:
        school_name = row['name']
        print("trying to match", school_name)

        # Best effort per row: a failure for one school is reported and the
        # loop carries on with the next one.
        try:
            # Search with the sanitized name; it was previously computed but
            # the raw name was passed to the indexes instead.
            sanitized_school_name = sanitize_query(school_name)
            if not sanitized_school_name.strip():
                print(f"Skipping {school_name!r}: nothing left to search for after sanitizing")
                continue

            last_doc_id = None
            for index in (search_index, phonetic_index):
                for document in index.search(sanitized_school_name):
                    doc_id = document['id']

                    # redis_client.get() returns None when no value is stored
                    # under the bare school id; fall back to the indexed
                    # content instead of failing on None.decode().
                    stored_name = redis_client.get(doc_id)
                    if stored_name is not None:
                        school_name_redis = stored_name.decode('utf-8')
                    else:
                        school_name_redis = document['content']

                    print(f"School Name: {school_name_redis}, Content: {document['content']}, ID: {doc_id}")
                    last_doc_id = doc_id

            if last_doc_id is not None:
                # Record the match as "matched_<id from file A>" -> name from file B.
                # NOTE(review): as before, only the last hit is recorded when
                # several documents match - confirm whether every hit should be stored.
                redis_client.set(f"matched_{last_doc_id}", school_name)
        except Exception as e:
            print(f"Error matching {school_name}: {e}")
            print("Resuming next school name")