Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Poppler last version #1

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
*.pyc

\.ropeproject/
65 changes: 21 additions & 44 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,37 +1,34 @@
#
# Multivio docker build
#
FROM debian:jessie
FROM debian:stretch-slim
MAINTAINER Johnny Mariéthoz <[email protected]>

# Node.js, bower, less, clean-css, uglify-js, requirejs
RUN apt-get update

RUN apt-get -qy upgrade --fix-missing --no-install-recommends

# Install dependencies
RUN apt-get -qy install --fix-missing --no-install-recommends \
g++ make git python python-dev python-pip \
swig cmake fontconfig libfontconfig1-dev \
libjpeg-dev libtiff-dev libopenjpeg-dev \
libapache2-mod-wsgi apache2 wget unzip
g++ cmake make git python3 python3-dev python3-pip python3-setuptools\
fontconfig libfontconfig1-dev zlib1g zlib1g-dev libpng-dev lib32z1-dev\
libjpeg-dev libtiff-dev libopenjp2-7-dev \
libapache2-mod-wsgi-py3 apache2 wget unzip

RUN pip3 install Cython

RUN pip3 install Pillow
WORKDIR /code

# Poppler
RUN git clone git://git.freedesktop.org/git/poppler/poppler
RUN git clone git://git.freedesktop.org/git/poppler/poppler

WORKDIR /code/poppler

# Patch poppler > 0.19
#RUN git checkout -b multivio poppler-0.38.0 \
# && perl -pi.bak -e 's/globalParams->getOverprintPreview\(\)/gTrue/g' poppler/SplashOutputDev.h

#poppler 0.18
RUN git checkout poppler-0.18

RUN mkdir -p /code/poppler/build && cd /code/poppler/build \
&& cmake -Wno-dev -D ENABLE_XPDF_HEADERS=True ../ \
&& make -j 2 install
&& make -j 2 install

# make libpoppler globally available
RUN ldconfig /usr/local/lib
Expand All @@ -41,15 +38,10 @@ COPY . /code/multivio
WORKDIR /code/multivio

# Basic Python
RUN pip install --upgrade pip setuptools \
RUN pip3 install --upgrade pip setuptools \
#install multivio
&& pip install --global-option=build_ext .

# Multivio client

#RUN adduser --uid 1000 --disabled-password --gecos '' multivio
#RUN chown -R multivio:multivio /code

# apache
RUN mkdir -p /var/log/multivio /var/tmp/multivio /var/www/multivio/server \
&& cp tools/multivio_server.py /var/www/multivio/server \
Expand All @@ -58,31 +50,16 @@ RUN mkdir -p /var/log/multivio /var/tmp/multivio /var/www/multivio/server \
&& cp tools/multivio.conf /etc/apache2/sites-available/ \
&& a2ensite multivio

# apache script
# apache script
RUN cp scripts/httpd-foreground /usr/local/bin \
&& chmod a+x /usr/local/bin/httpd-foreground

WORKDIR /var/www/multivio/client
RUN wget http://demo.multivio.org/multivio/client_1.0.0.zip \
&& unzip client_1.0.0.zip \
&& mv client_1.0.0/* . \
&& rm -fr client_1.0.0 client_1.0.0.zip \
&& chown -R www-data:www-data /var/www/multivio/client

#WORKDIR /
# Slim down image
RUN rm -fr /code \
RUN apt-get clean autoclean \
&& rm -rf /var/lib/{apt,dpkg}/ \
&& find /usr/share/doc -depth -type f ! -name copyright -delete \
&& find /usr/share/doc -empty -delete \
&& rm -rf /usr/share/man/* /usr/share/groff/* /usr/share/info/* \
&& rm -rf /tmp/* /var/lib/{cache,log}/ /root/.cache/* \
&& apt-get -qy remove --purge make git python-dev python-pip swig wget unzip cmake cpp binutils \
&& apt-get -qy autoremove

#USER multivio
#VOLUME ["/code"]
CMD ["httpd-foreground"]
#WORKDIR /var/www/multivio/client
#RUN wget http://demo.multivio.org/multivio/client_1.0.0.zip \
# && unzip client_1.0.0.zip \
# && mv client_1.0.0/* . \
# && rm -fr client_1.0.0 client_1.0.0.zip \
# && chown -R www-data:www-data /var/www/multivio/client

#CMD ["rerodoc", "--debug", "run", "-h", "0.0.0.0"]

CMD ["httpd-foreground"]
35 changes: 17 additions & 18 deletions multivio/dc_parser.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Document Parser module for Multivio"""

#==============================================================================
# ==============================================================================
# This file is part of the Multivio software.
# Project : Multivio - https://www.multivio.org/
# Copyright: (c) 2009-2011 RERO (http://www.rero.ch/)
# License : See file COPYING
#==============================================================================
# ==============================================================================

__copyright__ = "Copyright (c) 2009-2011 RERO"
__license__ = "GPL V.2"

#---------------------------- Modules ---------------------------------------
# ---------------------------- Modules ---------------------------------------

# import of standard modules
import sys
Expand All @@ -23,9 +23,10 @@
from xml.dom.minidom import parseString

# local modules
from parser import DocumentParser, ParserError
from multivio.parser import DocumentParser, ParserError

# ----------------------------------- Classes -----------------------------------

#----------------------------------- Classes -----------------------------------

class DublinCoreParser(DocumentParser):
"""To parse PDF document"""
Expand Down Expand Up @@ -53,7 +54,7 @@ def _get_record(self):
self._file_stream.seek(0)
content_str = self._file_stream.read()
doc = parseString(content_str)

records = doc.getElementsByTagName('collection')

# get the id number of the first record
Expand All @@ -62,7 +63,7 @@ def _get_record(self):
"XML/Dublin Core document should contains at lease one record!")
if len(records) > 1:
raise ParserError.InvalidDocument(
"XML/Dublin Core document should not contains more than "\
"XML/Dublin Core document should not contains more than "
"one record!")
return records[0]

Expand All @@ -71,15 +72,15 @@ def get_metadata(self):
record = self._get_record()
metadata = {}
metadata['title'] = self._get_values_for_labels(record,
'title')[0].decode('utf-8')
'title')[0].decode('utf-8')
metadata['creator'] = [v.decode('utf-8') for v in
self._get_values_for_labels(record, 'creator')]
self._get_values_for_labels(record, 'creator')]
metadata['language'] = self._get_values_for_labels(record,
'language')[0].decode('utf-8')
self.logger.debug("Metadata: %s"% json.dumps(metadata, sort_keys=True,
indent=4))
'language')[0].decode('utf-8')
self.logger.debug("Metadata: %s" % json.dumps(metadata, sort_keys=True,
indent=4))
return metadata

def get_physical_structure(self):
"""Get the physical structure of the pdf."""
phys_struct = []
Expand All @@ -90,16 +91,14 @@ def get_physical_structure(self):
'url': url,
'label': url.split('/')[-1]
})
self.logger.debug("Physical Structure: %s"% json.dumps(phys_struct,
sort_keys=True, indent=4))
self.logger.debug("Physical Structure: %s" % json.dumps(phys_struct,
sort_keys=True, indent=4))
return phys_struct


def _get_values_for_labels(self, record, tag_name):
"""Return the values of every `tag_name` element found under `record`.

Elements are looked up with `getElementsByTagNameNS`, using
`self._namespace_URI` as the namespace. For each matching element that
has at least one child node, the `nodeValue` of its first child is
encoded as UTF-8 and collected. Elements without children are skipped.

Returns a list of the encoded values (empty when nothing matches).
"""
res = []
for data_field in record.getElementsByTagNameNS(self._namespace_URI, tag_name):
# Skip empty elements: they have no child node to read a value from.
if data_field.firstChild is not None:
# Only the first child node is read; any further children are ignored.
res.append(data_field.firstChild.nodeValue.encode('utf-8'))
return res

96 changes: 49 additions & 47 deletions multivio/dispatcher_app.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,59 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Multivio HTTP requests dispatcher."""

#==============================================================================
# ==============================================================================
# This file is part of the Multivio software.
# Project : Multivio - https://www.multivio.org/
# Copyright: (c) 2009-2011 RERO (http://www.rero.ch/)
# License : See file COPYING
#==============================================================================
# ==============================================================================

__copyright__ = "Copyright (c) 2009-2011 RERO"
__license__ = "GPL V.2"


#---------------------------- Modules -----------------------------------------
# ---------------------------- Modules -----------------------------------------

# import of standard modules
import re
import sys
import sys
from optparse import OptionParser
if sys.version_info < (2, 6):
import simplejson as json
else:
import json

# third party modules
import logger
import multivio.logger as logger
import logging
#import mvo_parser
import processor_app
import parser_app
import version_app
import multivio.processor_app as processor_app
import multivio.parser_app as parser_app
import multivio.version_app as version_app

from web_app import WebApplication
from webprocessor_app import WebProcessorApp
from multivio.web_app import WebApplication
from multivio.webprocessor_app import WebProcessorApp
from mvo_config import MVOConfig
from web_app import ApplicationError
from multivio.web_app import ApplicationError


class DispatcherApp(WebApplication):
""" Dispach http request to several applications given the URI.

This is the entry point of the server application. This class is
responsible to call applications given the URI of the HTTP request.
"""

def __init__(self):
"Simple constructor."
"Simple constructor."

WebApplication.__init__(self)

#application configuration
# application configuration
self._apps = {}

#server applications
# server applications
self._apps['.*?/log/post'] = logger.LoggerApp()
self._apps['.*?/version'] = version_app.VersionApp()
self._apps['.*?/get.*?'] = \
Expand All @@ -64,7 +64,7 @@ def __init__(self):
WebProcessorApp()
self.usage = """<br><h1>Welcome to the multivio server.</h1><br>"""
self.logger = logging.getLogger(MVOConfig.Logger.name+".Dispatcher")

def __call__(self, environ, start_response):
"""Main method to dispatch HTTP requests."""

Expand All @@ -78,47 +78,48 @@ def __call__(self, environ, start_response):
response.extend(["<h3>%s</h3>" % k])
self.logger.debug(self._apps[k].usage)
response.extend([self._apps[k].usage])
return response
return response.encode('utf-8')
for k in self._apps.keys():
print(k) # TODO
if re.match(k, path):
try:
return self._apps[k](environ, start_response)
return self._apps[k](environ, start_response).encode('utf-8')
except (ApplicationError.PermissionDenied,
ApplicationError.UnableToRetrieveRemoteDocument,
ApplicationError.UnsupportedFormat,
ApplicationError.InvalidArgument,
ApplicationError.HttpMethodNotAllowed), exception:
ApplicationError.UnableToRetrieveRemoteDocument,
ApplicationError.UnsupportedFormat,
ApplicationError.InvalidArgument,
ApplicationError.HttpMethodNotAllowed) as exception:
start_response(exception.http_code, [('content-type',
'application/json')])
'application/json')])
self.logger.error("Exception: %s occurs with message: %s" %
(type(exception).__name__, str(exception)))
(type(exception).__name__, str(exception)))
result = {
'err_name': type(exception).__name__,
'err_msg' : str(exception)
'err_name1': type(exception).__name__,
'err_msg': str(exception)
}
return [json.dumps(result, sort_keys=True, indent=4)]
except Exception, exception:
return [(json.dumps(result, sort_keys=True, indent=4)).encode('utf-8')]
except Exception as exception:
start_response('500 Internal Server Error',
[('content-type', 'application/json')])
[('content-type', 'application/json')])
self.logger.error("Exception: %s occurs with message: %s" %
(type(exception).__name__, str(exception)))
(type(exception).__name__, str(exception)))
result = {
'err_name': type(exception).__name__,
'err_msg' : str(exception)
'err_name2': type(exception).__name__,
'err_msg': str(exception)
}
return [json.dumps(result, sort_keys=True, indent=4)]
return [(json.dumps(result, sort_keys=True, indent=4)).encode('utf-8')]
else:
self.logger.error("HTTP: 404 for %s" % path)
start_response('404 File Not Found', [('content-type',
'application/json')])
'application/json')])
result = {
'err_name': "FileNotFound",
'err_msg' : "File not found"
'err_msg': "File not found"
}
return [json.dumps(result, sort_keys=True, indent=4)]
return [(json.dumps(result, sort_keys=True, indent=4)).encode('utf-8')]


#---------------------------- Main Part ---------------------------------------
# ---------------------------- Main Part ---------------------------------------

def main():
"""Main function"""
Expand All @@ -127,17 +128,17 @@ def main():

parser = OptionParser(usage)

parser.set_description ("Web app to test the dispatcher.")
parser.set_description("Web app to test the dispatcher.")

parser.add_option ("-v", "--verbose", dest="verbose",
help="Verbose mode",
action="store_true", default=False)
parser.add_option("-v", "--verbose", dest="verbose",
help="Verbose mode",
action="store_true", default=False)

parser.add_option ("-p", "--port", dest="port",
help="Http Port (Default: 4041)",
type="int", default=4041)
parser.add_option("-p", "--port", dest="port",
help="Http Port (Default: 4041)",
type="int", default=4041)

(options, args) = parser.parse_args ()
(options, args) = parser.parse_args()

if len(args) != 0:
parser.error("Error: incorrect number of arguments, try --help")
Expand All @@ -146,5 +147,6 @@ def main():
server = make_server('', options.port, application)
server.serve_forever()


if __name__ == '__main__':
main()
Loading