Skip to content

Commit

Permalink
test encore le coup du CSV
Browse files Browse the repository at this point in the history
  • Loading branch information
vperron committed Nov 19, 2024
1 parent 896e1e4 commit e46332d
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions deduplication/src/02_create_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def cluster_rows(clustered_dupes):

with write_con.cursor() as write_cur:
write_cur.copy_expert(
- "COPY dedupe_predicates FROM STDIN WITH CSV",
+ "COPY dedupe_predicates FROM STDIN WITH CSV ENCODING 'utf-8'",
ReadableIterator(fingerprinted_data),
size=SLICE_SIZE,
)
Expand Down Expand Up @@ -175,15 +175,16 @@ def cluster_rows(clustered_dupes):
"""
CREATE TABLE dedupe_clusters (
id TEXT,
- structure_id TEXT,
+ structure_id VARCHAR(512),
score FLOAT,
size INTEGER,
PRIMARY KEY(structure_id)
)
"""
)
cur.copy_expert(
- "COPY dedupe_clusters FROM STDIN WITH CSV ENCODING 'utf-8'",
+ "COPY dedupe_clusters FROM STDIN WITH CSV HEADER DELIMITER AS ';' ENCODING 'utf-8'",
+ # everything returned from ReadableIterator not trimmed yet
ReadableIterator(cluster_rows(clustered_dupes)),
size=SLICE_SIZE,
)
Expand Down

0 comments on commit e46332d

Please sign in to comment.