-
Notifications
You must be signed in to change notification settings - Fork 1
/
chandigarh_embed.py
56 lines (39 loc) · 1.53 KB
/
chandigarh_embed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import pandas as pd
import tiktoken
from pdfminer.high_level import extract_text
from tqdm import tqdm
from website.embed import embed_text
# Load tiktoken's "cl100k_base" encoding once, at import time.
# NOTE(review): `enc` is not referenced by any other code visible in this
# file apart from the check below -- confirm it is still needed.
enc = tiktoken.get_encoding("cl100k_base")
# Import-time sanity check: encoding and then decoding a simple string must
# give the original string back.
assert enc.decode(enc.encode("hello world")) == "hello world"
def process_row(row: pd.Series) -> pd.Series:
    """Attach the judgement's PDF text and its embedding to *row*.

    The PDF is looked up under ``judgement_pdfs/chandigarh/`` using the row's
    ``case_number`` value as the file stem. The extracted text is stored in
    ``row['pdf_text']`` and the result of ``embed_text`` on that text in
    ``row['embedding']``.

    The given Series is modified in place and is also returned. Exceptions
    raised by ``extract_text`` or ``embed_text`` are not caught here.
    """
    pdf_path = f"judgement_pdfs/chandigarh/{row['case_number']}.pdf"
    text = extract_text(pdf_path)

    # Store the text before embedding, so it is already on the row if the
    # embedding call fails.
    row['pdf_text'] = text

    # NOTE(review): the original comment says embed_text uses OpenAI's
    # text-embedding-ada-002 model -- confirm in website.embed.
    row['embedding'] = embed_text(text)
    return row
def main():
    """Embed every Chandigarh judgement and write the results to CSV.

    Reads ``structured_judgements/chandigarh.csv`` and calls ``process_row``
    on each row to fill in ``pdf_text`` and ``embedding``. The result is
    written to ``embeddings/chandigarh.csv``.

    The run is resumable: rows that already have a non-null ``pdf_text`` in an
    existing ``embeddings/chandigarh.csv`` are not recomputed. Their stored
    ``pdf_text`` / ``embedding`` values are copied into the new frame first, so
    that rewriting the output file does not erase earlier work. Rows are
    matched between the two files by index label (both are read with the
    default index, i.e. by row position).

    A row whose processing raises is reported and left empty, so that it is
    retried on the next run.
    """
    output_path = 'embeddings/chandigarh.csv'

    # On the first run there is no previous output (or only an empty file);
    # treat that as "nothing embedded yet" instead of crashing.
    try:
        output_df = pd.read_csv(output_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        output_df = None

    # Load the CSV file
    source_df = pd.read_csv('structured_judgements/chandigarh.csv')

    # Add columns for pdf_text and embedding
    source_df['pdf_text'] = None
    source_df['embedding'] = None

    # Carry previously computed results over *before* the loop starts, so the
    # file written at the end (or on interruption) never loses them.
    already_done = set()
    if output_df is not None and 'pdf_text' in output_df.columns:
        done_index = output_df.index[output_df['pdf_text'].notna()]
        # Ignore stale rows that no longer exist in the source file.
        done_index = done_index.intersection(source_df.index)
        for column in ('pdf_text', 'embedding'):
            if column in output_df.columns:
                source_df.loc[done_index, column] = output_df.loc[done_index, column]
        already_done = set(done_index)

    try:
        # Iterate over the rows of the DataFrame
        rows = tqdm(source_df.iterrows(), total=len(source_df))
        for row_num, (index, row) in enumerate(rows):
            # Skip rows that have already been processed
            if index in already_done:
                print(f"Skipping row {row_num} as it has already been embedded")
                continue

            try:
                # Process the row
                source_df.loc[index] = process_row(row)
            except Exception as e:
                # Best effort: one unreadable PDF or failed embedding call
                # must not abort the whole batch.
                print(f"Error processing row {row_num}: {e}")
    finally:
        # Save the DataFrame to a CSV file. Done in ``finally`` so that an
        # interrupted run (e.g. Ctrl-C) still keeps the rows finished so far.
        source_df.to_csv(output_path, index=False)


if __name__ == '__main__':
    main()