-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_empty_subfields_and_delimiters_git.py
143 lines (124 loc) · 4.98 KB
/
find_empty_subfields_and_delimiters_git.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from pymarc import *
import re
from datetime import datetime
## Script by Arcadia Falcone, arcadia.falcone at gmail, updated 12/19/2014
## Checks for errors causing Solr ingest to fail: empty subfields and empty
## delimiters. Outputs detailed error log.
def openFilesIO():
    """Prompt the user to select an input file via a Tk file dialog.

    Returns a 3-tuple of paths:
      infile    - absolute path of the MARC file the user selected
      errorfile - '<input basename>_error.txt' in the same directory
      logfile   - '<input basename>_log.txt' in the same directory
    """
    import os
    import Tkinter, tkFileDialog
    root = Tkinter.Tk()
    root.withdraw()
    try:
        fileselect = tkFileDialog.askopenfilename()
    finally:
        # Destroy the hidden root window so Tk resources are released
        # even if the dialog raises.
        root.destroy()
    filepath = os.path.dirname(fileselect)
    filename = os.path.basename(fileselect)
    basename = os.path.splitext(filename)[0]
    infile = os.path.abspath(fileselect)
    # os.path.join is portable, unlike hand-built '/' concatenation.
    outfile_base = os.path.join(filepath, basename)
    errorfile = outfile_base + '_error.txt'
    logfile = outfile_base + '_log.txt'
    return infile, errorfile, logfile
### Variables ###

# Map MARC leader position 6 (record type code) to a readable format label.
bib_formats = {
    'a': 'print',
    'c': 'music',
    'd': 'music',
    'e': 'maps',
    'f': 'maps',
    'g': 'visual',
    'i': 'audio',
    'j': 'audio',
    'k': 'visual',
    'm': 'digital',
    'o': 'kit',
    'p': 'archives',
    'r': 'object',
    't': 'print',
}

# Regex
space_re = re.compile(r'\s*$')                 # whitespace-only (or empty) string
valid_code_re = re.compile(r'[a-z0-9]')        # legal subfield code character
alpha_num_re = re.compile(r'.*?[A-Za-z0-9]+')  # contains at least one alphanumeric
repeated_sub_code_re = re.compile(r'\$([a-z0-9])\s*\$\1')  # same code twice in a row, e.g. "$a $a"
terminal_empty_sub_re = re.compile(r'\$[a-z0-9]$')         # field ends with a bare "$x"
### Process ###

# Record start time
start_time = datetime.now()

# Select MARC file for processing and create output files.
# print is written in call form: with a single expression argument this
# produces identical output under Python 2 and is valid under Python 3.
print('Select the MARC file for error validation.')
marc_file, error_file, log_file = openFilesIO()
outfile = open(error_file, 'w')
log = open(log_file, 'w')
print('Processing %s...' % marc_file)

# Write tab-delimited headers to the error report file
headers = ['bib_id', 'marc_field', 'field_value', 'error_type', 'format',
           'language', 'batch', 'errors_per_bib']
outfile.write('\t'.join(headers) + '\n')

# Counters: records read, individual errors found, records with >= 1 error
process_count = 0
error_count = 0
record_count = 0
# Open in binary mode: the builtin file() is Python-2-only, and MARC
# transmission format is binary, so open(..., 'rb') is the correct form.
reader = MARCReader(open(marc_file, 'rb'))
for record in reader:
    process_count += 1
    # Reset values for each record
    output_flag = False    # becomes True once any error row is queued
    record_output = []     # error rows accumulated for this record
    # Set current record ID and get all fields
    record_id = record['001'].value()
    all_fields = record.get_fields()
    for field in all_fields:
        # Skip control fields (don't have subfields)
        if field.is_control_field():
            continue
        if field.subfields:
            # Reset values for each field.  pymarc stores subfields as a
            # flat [code, value, code, value, ...] list, so odd positions
            # (1-based) are codes and even positions are values.
            i = 0
            old_errors = []    # error types already reported for this field
            # Iterate through subfields and analyze for errors
            for subfield in field.subfields:
                old_error_count = error_count
                i += 1
                if i % 2 != 0:
                    # Subfield code position
                    if re.match(space_re, subfield):
                        error = 'empty delimiter'
                        error_count += 1
                    elif not re.match(valid_code_re, subfield):
                        error = 'invalid subfield code'
                        error_count += 1
                else:
                    # Subfield value position
                    if re.match(space_re, subfield):
                        error = 'empty subfield'
                        error_count += 1
                # Queue output only for a new, non-duplicate error type.
                # `error` is only read when error_count advanced, so it is
                # always bound here (short-circuit guards the first branch).
                if error_count > old_error_count and error not in old_errors:
                    old_errors.append(error)
                    # Get additional metadata from the record
                    bib_format = bib_formats[record.leader[6]]
                    language = record['008'].value()[35:38]
                    # Identify candidates for batch processing
                    if re.search(repeated_sub_code_re, str(field)):
                        batch = 'batch - repeated subfield code'
                    elif re.search(terminal_empty_sub_re, str(field)):
                        batch = 'batch - terminal empty subfield'
                    else:
                        batch = ''
                    # Set output values
                    record_output.append([record_id, field.tag, str(field),
                                          error, bib_format, language, batch])
                    output_flag = True
        else:
            print('%s: no subfields in %s' % (record_id, str(field)))
    if output_flag:
        record_count += 1
        # Note whether same record has one error, or multiple errors
        if len(record_output) > 1:
            error_num = 'multiple'
        else:
            error_num = 'single'
        # Write record error data to output file
        for output in record_output:
            output.append(error_num)
            outfile.write('\t'.join(output) + '\n')
# Write summary of results to log file
stop_time = datetime.now()
log.write('Process started: %s\n' % start_time)
log.write('Process completed: %s\n\n' % stop_time)
log.write('%d records processed.\n' % process_count)
log.write('%d errors identified.\n' % error_count)
log.write('%d records with errors.\n' % record_count)

# Write summary of results to console (call-form print is valid on both
# Python 2 and Python 3 for a single expression argument)
print('Records processed: %d' % process_count)
print('Errors identified: %d' % error_count)
print('Records with errors: %d' % record_count)

# Close files
outfile.close()
log.close()