-
Notifications
You must be signed in to change notification settings - Fork 1
/
springer_link_csv_to_bibtex_parser.py
72 lines (60 loc) · 3.24 KB
/
springer_link_csv_to_bibtex_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
import re
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
def split_camel_case_joined_names(joined_camel_case_names):
    """Split names that were concatenated without a separator.

    A new name is taken to start wherever

    * a lower-case letter is immediately followed by an upper-case letter
      ("John SmithJane Doe" -> "John Smith", "Jane Doe"), or
    * an upper-case letter is followed by an upper-case letter that itself
      starts a lower-case run ("JRSmith" -> "JR", "Smith").

    Case is decided with ``str.islower`` / ``str.isupper`` so that accented
    and other non-ASCII letters ("Dubé", "Émile") mark boundaries exactly
    like ASCII letters do. For pure-ASCII input the split points are the same
    as with an ``[a-z]`` / ``[A-Z]`` test.

    Args:
        joined_camel_case_names: The string holding the concatenated names.

    Returns:
        A list of the individual names in their original order. No
        characters are dropped; an empty input yields an empty list.
    """
    text = joined_camel_case_names
    length = len(text)
    names = []
    start = 0
    for index in range(1, length):
        previous_char = text[index - 1]
        current_char = text[index]
        lower_then_upper = previous_char.islower() and current_char.isupper()
        # Keeps an all-caps run together and splits only before the capital
        # that begins the next capitalised word.
        upper_run_then_word = (
            previous_char.isupper()
            and current_char.isupper()
            and index + 1 < length
            and text[index + 1].islower()
        )
        if lower_then_upper or upper_run_then_word:
            names.append(text[start:index])
            start = index
    if start < length:
        names.append(text[start:])
    return names
def join_names_as_camel_case(name):
    """Collapse a multi-part name into a single camelCase token.

    The name is cut at every character that is not an ASCII letter and does
    not fall in the ranges U+00C0-U+024F or U+1E00-U+1EFF. The first piece is
    lower-cased; each later piece that is alphanumeric is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).
    Because the split pattern captures its delimiters, a delimiter that is
    itself alphanumeric (a digit, for instance) is kept in the result, while
    spaces, dots, hyphens and empty pieces are discarded.

    Args:
        name: The name to convert, e.g. "John Smith".

    Returns:
        The camelCase form, e.g. "johnSmith".
    """
    head, *tail = re.split('([^a-zA-Z\u00C0-\u024F\u1E00-\u1EFF])', name)
    pieces = [head.lower()]
    for token in tail:
        if token.isalnum():
            pieces.append(token.capitalize())
    return ''.join(pieces)
class CsvToBibtexParser:
    """Convert a SpringerLink auto-generated references CSV into BibTeX.

    Given the path of the CSV export and an output path, the parser writes one
    BibTeX entry per CSV row to the output (.bib) file.

    The rows are expected to carry the columns 'Publication Title',
    'Item Title', 'Authors', 'Publication Year', 'Item DOI', 'URL' and
    'Content Type'.
    """

    def __init__(self, csv_file_path, output_file_path):
        """Load the CSV eagerly and remember where the .bib file goes.

        Args:
            csv_file_path: Path (or anything ``pandas.read_csv`` accepts) of
                the SpringerLink CSV export.
            output_file_path: Path of the BibTeX file to create/overwrite.
        """
        self.csv = pd.read_csv(csv_file_path)
        self.output_path = output_file_path

    def convert_csv_to_bibtex(self):
        """Write every CSV row as a BibTeX entry to ``self.output_path``.

        Entries are written in CSV row order. The output file is opened in
        write mode, so existing content is replaced.

        NOTE(review): keys are "<firstAuthor><year>", so two rows sharing
        first author and year produce the same citation key.
        """
        csv_records = self.csv.to_dict('records')
        writer = BibTexWriter()
        with open(self.output_path, 'w', encoding="utf-8") as bibtex_file:
            for csv_entry in csv_records:
                bibtex_entry = self.convert_csv_entry_to_bibtex_entry(csv_entry)
                bibtex_file.write(writer.write(bibtex_entry))

    def convert_csv_entry_to_bibtex_entry(self, document_record):
        """Build a single-entry ``BibDatabase`` from one CSV row.

        Args:
            document_record: Mapping of CSV column name to cell value for one
                row (as produced by ``DataFrame.to_dict('records')``).

        Returns:
            A ``BibDatabase`` whose ``entries`` list holds exactly one entry.
            Fields whose CSV cell is missing are left out instead of being
            written as the literal text "nan"; a missing content type falls
            back to the standard BibTeX type ``misc``.
        """
        authors_list = self.get_authors_from_csv_entry(document_record)
        candidate_fields = {
            'journal': self._cell_to_text(document_record['Publication Title']),
            'title': self._cell_to_text(document_record['Item Title']),
            # BibTeX separates multiple names with " and "; a comma inside
            # the author field is read as "Last, First" of a single person.
            'author': self._join_authors_for_bibtex(authors_list),
            'year': self._format_year(document_record['Publication Year']),
            'doi': self._cell_to_text(document_record['Item DOI']),
            'url': self._cell_to_text(document_record['URL']),
        }
        entry = {field: text for field, text in candidate_fields.items() if text}
        entry['ENTRYTYPE'] = self._cell_to_text(document_record['Content Type']) or 'misc'
        entry['ID'] = self.create_bibtex_entry_key_from_csv_entry(document_record)

        bibtex_entry = BibDatabase()
        bibtex_entry.entries = [entry]
        return bibtex_entry

    def create_bibtex_entry_key_from_csv_entry(self, csv_entry):
        """Return the citation key "<firstAuthorCamelCase><year>" for a row.

        Args:
            csv_entry: Mapping of CSV column name to cell value for one row.

        Returns:
            The key as a string, e.g. "johnSmith2019". When the row has no
            authors the author part is "unknown"; when it has no year the
            year part is empty.
        """
        document_authors = self.get_authors_from_csv_entry(csv_entry)
        first_author = document_authors[0] if document_authors else 'unknown'
        first_author_camel_case = join_names_as_camel_case(first_author)
        document_year = self._format_year(csv_entry['Publication Year'])
        return first_author_camel_case + document_year

    @staticmethod
    def get_authors_from_csv_entry(csv_entry):
        """Return the row's authors as a list of individual names.

        The 'Authors' cell holds the names concatenated without a separator;
        they are separated again with ``split_camel_case_joined_names``.

        Args:
            csv_entry: Mapping of CSV column name to cell value for one row.

        Returns:
            A list of author names; empty when the cell is missing.
        """
        raw_authors = csv_entry['Authors']
        if pd.isna(raw_authors):
            return []
        return split_camel_case_joined_names(str(raw_authors))

    @staticmethod
    def remove_braces_and_quotes_from_authors_list(authors_list):
        """Render a list of names as plain comma-separated text.

        Joins the names directly instead of post-processing ``str(list)``,
        so apostrophes inside a name ("O'Brien") are preserved and no stray
        quote characters from the list representation leak into the result.

        Args:
            authors_list: Iterable of author names.

        Returns:
            The names joined with ", " (empty string for an empty list).
        """
        return ', '.join(str(author) for author in authors_list)

    @staticmethod
    def _join_authors_for_bibtex(authors_list):
        """Join names with " and ", the separator BibTeX uses between authors."""
        return ' and '.join(str(author) for author in authors_list)

    @staticmethod
    def _cell_to_text(value):
        """Return a CSV cell as text, or '' when the cell is missing (NaN/None)."""
        if pd.isna(value):
            return ''
        return str(value)

    @staticmethod
    def _format_year(value):
        """Return the publication year as text without a float suffix.

        A numeric column that contains missing cells is loaded as floats, so
        2019 would otherwise be rendered as "2019.0". Missing years yield ''.
        """
        if pd.isna(value):
            return ''
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)