-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_xml_pdf_for_hw.py
148 lines (120 loc) · 4.8 KB
/
prepare_xml_pdf_for_hw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import glob
import settings as settings
import zipfile
import arrow
import os
import shutil
import logging
"""
look in unpacked_renamed_ejp_files
look for all matching pdf and xml files
elife_poa_e000213.xml
elife_poa_e000213.pdf
If there is an xml or pdf file that is not matched, log a warning
for the day of delivery take these files and put them into a zip file named
elife_poa_YYYYMMDD.zip
put that zip file into `ftp-to-hw`
move processed pdf and xml files into
made_ftp_ready_on/YYYYMMDD
GOTCHAS
When run multiple times it may corrupt existing zip files; this is worth investigating.
"""
## Logging configuration
# Two file-backed loggers are configured below. Both emit INFO and above,
# formatted as "<timestamp> <level> <message>". The module-level names
# `hdlr` and `formatter` are reused, so after this section they refer to
# the objects attached to the global (workflow) logger.

# local logger -> prepPdfXMLforFTP.log
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr = logging.FileHandler('prepPdfXMLforFTP.log')
hdlr.setFormatter(formatter)
logger = logging.getLogger('prepPdfXMLforFTP')
logger.setLevel(logging.INFO)
logger.addHandler(hdlr)

# global logger -> ejp_to_hw_workflow.log
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr = logging.FileHandler('ejp_to_hw_workflow.log')
hdlr.setFormatter(formatter)
workflow_logger = logging.getLogger('ejp_to_hw_workflow')
workflow_logger.setLevel(logging.INFO)
workflow_logger.addHandler(hdlr)
def zip(src, dst):
    """
    Zip every file found under the directory `src` into "<dst>.zip".

    Each file is stored in the archive under its path relative to `src`,
    and every addition is reported on the local logger.

    NOTE: this function shadows the builtin `zip` inside this module; the
    name is kept so that existing callers keep working.
    """
    abs_src = os.path.abspath(src)
    # The context manager closes the archive even if a write fails
    # part-way through, so the file handle is never leaked.
    with zipfile.ZipFile("%s.zip" % (dst), "w") as zf:
        for dirname, subdirs, files in os.walk(src):
            for filename in files:
                path = os.path.join(dirname, filename)
                absname = os.path.abspath(path)
                # relpath rather than slicing by len(abs_src) + 1: the slice
                # drops one character too many when abs_src already ends in
                # a separator (e.g. the filesystem root).
                arcname = os.path.relpath(absname, abs_src)
                logger.info('zipping %s as %s', path, arcname)
                zf.write(absname, arcname)
def check_matching_files_exist(pdf_file_articles_numbers, xml_file_articles_numbers):
    """
    Log a warning for every pdf name that has no xml of the same name,
    and for every xml name that has no pdf of the same name.

    Both arguments are collections of file names without extension.
    Nothing is returned; unmatched names are only reported on the local
    logger, in the order in which they appear in the inputs.
    """
    # Sets make each membership test constant time instead of a list scan.
    xml_names = set(xml_file_articles_numbers)
    pdf_names = set(pdf_file_articles_numbers)
    for name in pdf_file_articles_numbers:
        if name not in xml_names:
            logger.warning("%s has no xml match", name)
    for name in xml_file_articles_numbers:
        if name not in pdf_names:
            logger.warning("%s has no pdf match", name)
def zip_matching_files(pdf_file_articles_numbers, xml_file_articles_numbers, zf, sourcedir):
    """
    Add the pdf and xml file of every matched article to the open zip `zf`.

    A name is matched when it appears in both name collections. For each
    matched name, "<name>.pdf" and then "<name>.xml" are read from
    `sourcedir` and stored at the top level of the archive (no folders).
    Names that appear in only one collection are skipped.
    """
    # Set lookup avoids rescanning the xml list for every pdf name.
    xml_names = set(xml_file_articles_numbers)
    for name in pdf_file_articles_numbers:
        if name not in xml_names:
            continue
        for extension in (".pdf", ".xml"):
            filename = name + extension
            # Archive members are stored flat, whatever the name contains.
            zf.write(os.path.join(sourcedir, filename), os.path.basename(filename))
def move_zipfile_to_hw_staging(xml_pdf_zip, ftp_to_hw):
    """
    Move the zip file `xml_pdf_zip` into the directory `ftp_to_hw`.

    The file keeps its own name inside `ftp_to_hw`. Only the base name of
    `xml_pdf_zip` is used for the destination, so the zip may be given
    with or without a leading directory.
    """
    destination = os.path.join(ftp_to_hw, os.path.basename(xml_pdf_zip))
    shutil.move(xml_pdf_zip, destination)
def move_processed_files(pdf_file_articles_numbers, xml_file_articles_numbers, sourcedir, made_ftp_ready):
    """
    Move the pdf and xml file of every matched article out of `sourcedir`.

    A name is matched when it appears in both name collections. For each
    matched name, "<name>.pdf" and then "<name>.xml" are moved from
    `sourcedir` into `made_ftp_ready`. Unmatched files stay where they are.
    """
    # Set lookup avoids rescanning the xml list for every pdf name.
    xml_names = set(xml_file_articles_numbers)
    for name in pdf_file_articles_numbers:
        if name not in xml_names:
            continue
        for extension in (".pdf", ".xml"):
            filename = os.path.basename(name + extension)
            shutil.move(os.path.join(sourcedir, filename),
                        os.path.join(made_ftp_ready, filename))
def set_datestamp():
    """
    Return the current UTC date, taken from arrow.utcnow(), as a string:
    the year followed by the two-digit month and the two-digit day.
    """
    today = arrow.utcnow().datetime
    return "%d%02d%02d" % (today.year, today.month, today.day)
def set_xml_pdf_zip_name():
    """
    Return the zip file name for the current date stamp:
    "elife_poa_" + set_datestamp() + ".zip".
    """
    return "".join(("elife_poa_", set_datestamp(), ".zip"))
def set_made_ftp_ready_dir():
    """
    Return the dated folder settings.MADE_FTP_READY + "/" + <date stamp>,
    creating it (and any missing parents) when it does not exist yet.
    """
    stamp = set_datestamp()
    dated_dir = settings.MADE_FTP_READY + "/" + stamp
    if os.path.exists(dated_dir):
        # Already there, nothing to create
        return dated_dir
    os.makedirs(dated_dir)
    return dated_dir
def get_filename_from_path(f, extension):
    """
    Get a filename minus the supplied file extension
    and without any folder or path.

    `extension` is removed only when the file name ends with it, so that
    result + extension is the original file name again. An extension-like
    string earlier in the path or name (e.g. "my.pdf.d/a.pdf" or
    "a.xml.v2.xml") is left alone. When the name does not end with
    `extension`, or `extension` is empty, the bare file name is returned.
    """
    # Remove path if present
    filename = os.path.basename(f)
    # Strip the suffix only; guard against an empty extension, for which
    # filename[:-0] would wrongly yield an empty string.
    if extension and filename.endswith(extension):
        filename = filename[:-len(extension)]
    return filename
def prepare_pdf_xml_for_ftp():
    """
    Bundle matching pdf/xml pairs from the staging folder into a dated zip.

    Steps:
    - list the *.pdf and *.xml files in settings.STAGING_TO_HW_DIR
    - log a warning for every file without a partner
    - write every matched pair into the zip named by set_xml_pdf_zip_name()
      (created in the current working directory, opened in "w" mode)
    - move that zip into settings.FTP_TO_HW_DIR
    - move the matched pdf and xml files into the dated folder returned by
      set_made_ftp_ready_dir()

    NOTE(review): a second run on the same UTC date reuses the same zip
    name, so an earlier zip of that name may be replaced - confirm
    whether that is intended.
    """
    sourcedir = settings.STAGING_TO_HW_DIR
    ftp_to_hw = settings.FTP_TO_HW_DIR

    pdf_files = glob.glob(sourcedir + "/*.pdf")
    xml_files = glob.glob(sourcedir + "/*.xml")
    pdf_file_articles_numbers = [get_filename_from_path(f, ".pdf") for f in pdf_files]
    xml_file_articles_numbers = [get_filename_from_path(f, ".xml") for f in xml_files]

    made_ftp_ready_dir = set_made_ftp_ready_dir()
    xml_pdf_zip = set_xml_pdf_zip_name()

    check_matching_files_exist(pdf_file_articles_numbers, xml_file_articles_numbers)

    # The with-block closes the zip before it is moved, and also when
    # adding a file raises, so the handle is never left open.
    with zipfile.ZipFile(xml_pdf_zip, "w") as zf:
        zip_matching_files(pdf_file_articles_numbers, xml_file_articles_numbers, zf, sourcedir)

    move_zipfile_to_hw_staging(xml_pdf_zip, ftp_to_hw)
    move_processed_files(pdf_file_articles_numbers, xml_file_articles_numbers, sourcedir, made_ftp_ready_dir)
# Script entry point: when executed directly (not imported), run the
# preparation step and then record its completion on the global logger.
if __name__ == "__main__":
    prepare_pdf_xml_for_ftp()
    workflow_logger.info("pdf and xml files prepared in readyness to ftp")