Skip to content

Commit

Permalink
Merge branch 'release/1.1.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
fedelemantuano committed Mar 17, 2017
2 parents c80b603 + d6d327c commit 04383a9
Show file tree
Hide file tree
Showing 10 changed files with 2,486 additions and 44 deletions.
29 changes: 29 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
language: python

python:
- "2.7"
- "3.3"
- "3.4"
- "3.5"
- "3.6"

env:
- TIKA_APP_PATH=/tmp/tika-app-1.14.jar

before_script:
- curl -o ${TIKA_APP_PATH} https://archive.apache.org/dist/tika/tika-app-1.14.jar

# command to install dependencies
install:
- pip install -r requirements.txt
- pip install coveralls

# command to run tests
script:
- coverage run --source=tikapp/ --omit=tikapp/__main__.py tests/test_tika_app.py
- coverage run --source=tikapp/ --omit=tikapp/__main__.py tests/performance.py
- python -m tikapp -v
- python -m tikapp -h

after_success:
coveralls
20 changes: 14 additions & 6 deletions README
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
|PyPI version| |Build Status| |Coverage Status|

tika-app-python
===============

Expand Down Expand Up @@ -44,8 +46,8 @@ or use ``pip``:

pip install tika-app

Usage
-----
Usage in a project
------------------

Import ``TikaApp`` class:

Expand Down Expand Up @@ -95,7 +97,7 @@ Usage from command-line
If you installed tika-app-python with ``pip`` or ``setup.py`` you can
use it with command-line. To use tika-app-python you should submit the
Apache Tika app JAR. You can: - leave the default value:
``/opt/tika/tika-app-1.13.jar`` - set the environment variable
``/opt/tika/tika-app-1.14.jar`` - set the environment variable
``TIKA_APP_JAR`` - use ``--jar`` switch

The last one overrides all the others.
Expand Down Expand Up @@ -124,9 +126,8 @@ These are all switches:

Example:

.. code:: shell
\`\`\`shell $ tikapp -f example\_file -a

$ tikapp -f example_file -a

Performance tests
-----------------
Expand All @@ -149,5 +150,12 @@ These are the results of performance tests in `tests`_ folder:
tika_extract_all_content() 0.785915 sec
tika_extract_only_content() 0.766517 sec

.. _Apache Tika App: https://tika.apache.org/
.. _tests: https://github.com/fedelemantuano/tika-app-python/tree/develop/tests
.. _Apache Tika App: https://tika.apache.org/

.. |PyPI version| image:: https://badge.fury.io/py/tika-app.svg
:target: https://badge.fury.io/py/tika-app
.. |Build Status| image:: https://travis-ci.org/fedelemantuano/tika-app-python.svg?branch=develop
:target: https://travis-ci.org/fedelemantuano/tika-app-python
.. |Coverage Status| image:: https://coveralls.io/repos/github/fedelemantuano/tika-app-python/badge.svg?branch=develop
:target: https://coveralls.io/github/fedelemantuano/tika-app-python?branch=develop
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
[![PyPI version](https://badge.fury.io/py/tika-app.svg)](https://badge.fury.io/py/tika-app)
[![Build Status](https://travis-ci.org/fedelemantuano/tika-app-python.svg?branch=develop)](https://travis-ci.org/fedelemantuano/tika-app-python)
[![Coverage Status](https://coveralls.io/repos/github/fedelemantuano/tika-app-python/badge.svg?branch=develop)](https://coveralls.io/github/fedelemantuano/tika-app-python?branch=develop)

# tika-app-python

## Overview
Expand Down Expand Up @@ -83,7 +87,7 @@ tika_client.extract_only_content(payload="base64_payload")

If you installed tika-app-python with `pip` or `setup.py` you can use it with command-line.
To use tika-app-python you should submit the Apache Tika app JAR. You can:
- leave the default value: `/opt/tika/tika-app-1.13.jar`
- leave the default value: `/opt/tika/tika-app-1.14.jar`
- set the environment variable `TIKA_APP_JAR`
- use `--jar` switch

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
chainmap==1.0.2
mail-parser==1.1.6
python-magic==0.4.12
simplejson==3.10.0
six==1.10.0
5 changes: 0 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,8 @@
"Intended Audience :: Developers",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.6",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.0",
"Programming Language :: Python :: 3.1",
"Programming Language :: Python :: 3.2",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5",
Expand Down
2,377 changes: 2,377 additions & 0 deletions tests/files/mail_test_1

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions tests/performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,20 @@
test_zip = os.path.join(profiling_path, "files", "lorem_ipsum.txt.zip")
test_txt = os.path.join(profiling_path, "files", "lorem_ipsum.txt")

TIKA_JAR = "/opt/tika/tika-app-1.14.jar"
try:
TIKA_APP_PATH = os.environ["TIKA_APP_PATH"]
except KeyError:
TIKA_APP_PATH = "/opt/tika/tika-app-1.14.jar"


def tika_content_type():
tika_client = TikaApp(file_jar=TIKA_JAR)
tika_client = TikaApp(file_jar=TIKA_APP_PATH)
output = tika_client.detect_content_type(path=test_zip)
return output


def tika_detect_language():
tika_client = TikaApp(file_jar=TIKA_JAR)
tika_client = TikaApp(file_jar=TIKA_APP_PATH)
output = tika_client.detect_language(path=test_zip)
return output

Expand All @@ -53,13 +56,13 @@ def magic_content_type():


def tika_extract_all_content(memory=None):
tika_client = TikaApp(file_jar=TIKA_JAR, memory_allocation=memory)
tika_client = TikaApp(file_jar=TIKA_APP_PATH, memory_allocation=memory)
output = tika_client.extract_all_content(path=test_zip)
return output


def tika_extract_only_content(memory=None):
tika_client = TikaApp(file_jar=TIKA_JAR, memory_allocation=memory)
tika_client = TikaApp(file_jar=TIKA_APP_PATH, memory_allocation=memory)
output = tika_client.extract_only_content(path=test_zip)
return output

Expand Down
58 changes: 36 additions & 22 deletions tests/test_tika_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,44 +28,64 @@
except ImportError:
import json

from mailparser import MailParser


unittest_path = os.path.realpath(os.path.dirname(__file__))
root = os.path.join(unittest_path, '..')
sys.path.append(root)
test_txt = os.path.join(unittest_path, 'files', 'test.txt')
test_zip = os.path.join(unittest_path, 'files', 'test.zip')
mail_test_1 = os.path.join(unittest_path, 'files', 'mail_test_1')

TIKA_JAR = "/opt/tika/tika-app-1.14.jar"
try:
TIKA_APP_PATH = os.environ["TIKA_APP_PATH"]
except KeyError:
TIKA_APP_PATH = "/opt/tika/tika-app-1.14.jar"

import tikapp as tika
from tikapp.exceptions import TikaAppJarError, FilePathError


class TestTikaApp(unittest.TestCase):

def setUp(self):
# Init
self.parser = MailParser()
self.tika = tika.TikaApp(file_jar=TIKA_APP_PATH)

def test_JSONDecodeError(self):
self.parser.parse_from_file(mail_test_1)

for i in self.parser.attachments_list:

r = self.tika.extract_all_content(
payload=i["payload"], convert_to_obj=False)
self.assertIsInstance(r, six.text_type)

r = self.tika.extract_all_content(
payload=i["payload"], convert_to_obj=True)
self.assertIsInstance(r, list)

def test_tikaappjarerror(self):
with self.assertRaises(TikaAppJarError):
tika.TikaApp()

def test_filepatherror(self):
tika_app = tika.TikaApp(file_jar=TIKA_JAR)

with self.assertRaises(TypeError):
tika_app.extract_all_content(path=None, payload=None)
self.tika.extract_all_content(path=None, payload=None)

with self.assertRaises(FilePathError):
tika_app.extract_all_content(
self.tika.extract_all_content(
path="/tmp/fake_rand_file", payload=None)

def test_generic(self):
tika_app = tika.TikaApp(file_jar=TIKA_JAR)
self.assertIsInstance(tika_app.generic(), six.text_type)
self.assertIsInstance(self.tika.generic(), six.text_type)

def test_extract_all_content_file(self):
tika_app = tika.TikaApp(file_jar=TIKA_JAR)
self.assertEqual(TIKA_APP_PATH, self.tika.file_jar)

self.assertEqual(TIKA_JAR, tika_app.file_jar)

result = tika_app.extract_all_content(test_zip)
result = self.tika.extract_all_content(test_zip)
self.assertIsInstance(result, six.text_type)

result_obj = json.loads(result, encoding="utf-8")
Expand All @@ -79,9 +99,7 @@ def test_extract_all_content_file(self):
self.assertEqual(result_obj[1]["resourceName"], "test.txt")

def test_extract_all_content_file_obj(self):
tika_app = tika.TikaApp(file_jar=TIKA_JAR)

result_obj = tika_app.extract_all_content(
result_obj = self.tika.extract_all_content(
path=test_zip, convert_to_obj=True)

self.assertIsInstance(result_obj, list)
Expand All @@ -93,13 +111,11 @@ def test_extract_all_content_file_obj(self):
self.assertEqual(result_obj[1]["resourceName"], "test.txt")

def test_extract_all_content_buffer(self):
tika_app = tika.TikaApp(file_jar=TIKA_JAR)

with open(test_zip, 'rb') as f:
payload = base64.b64encode(f.read())

result_file = tika_app.extract_all_content(path=test_zip)
result_payload = tika_app.extract_all_content(payload=payload)
result_file = self.tika.extract_all_content(path=test_zip)
result_payload = self.tika.extract_all_content(payload=payload)

self.assertIsInstance(result_file, six.text_type)
self.assertIsInstance(result_payload, six.text_type)
Expand All @@ -117,13 +133,11 @@ def test_extract_all_content_buffer(self):
result_payload_obj[1]["resourceName"])

def test_detect_language(self):
tika_app = tika.TikaApp(file_jar=TIKA_JAR)
result = tika_app.detect_language(path=test_txt)
result = self.tika.detect_language(path=test_txt)
self.assertEqual(result, "en")

def test_extract_only_content(self):
tika_app = tika.TikaApp(file_jar=TIKA_JAR)
result = tika_app.extract_only_content(path=test_txt)
result = self.tika.extract_only_content(path=test_txt)
self.assertIsInstance(result, six.text_type)
self.assertIn("test", result)

Expand Down
19 changes: 15 additions & 4 deletions tikapp/tikapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@
from __future__ import unicode_literals
import logging
import os
from subprocess import Popen, PIPE, STDOUT
import subprocess

import six
from .exceptions import TikaAppJarError
from .utils import file_path, clean, sanitize

Expand Down Expand Up @@ -81,15 +82,25 @@ def _command_template(self, switches):
Standard output data (unicode Python 2, str Python 3)
"""

command = ["java", "-jar", self.file_jar]
command = ["java", "-jar", self.file_jar, "-eUTF-8"]

if self.memory_allocation:
command.append("-Xmx{}".format(self.memory_allocation))

command.extend(switches)

out = Popen(command, stdin=PIPE, stdout=PIPE, stderr=STDOUT)
stdoutdata, stderrdata = out.communicate()
if six.PY2:
with open(os.devnull, "w") as devnull:
out = subprocess.Popen(
command, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=devnull)
elif six.PY3:
out = subprocess.Popen(
command, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

stdoutdata, _ = out.communicate()

return stdoutdata.decode("utf-8").strip()

def generic(self, switches=["--help"]):
Expand Down
2 changes: 1 addition & 1 deletion tikapp/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
limitations under the License.
"""

__version__ = "1.0.1"
__version__ = "1.1.0"

if __name__ == "__main__":
print(__version__)

0 comments on commit 04383a9

Please sign in to comment.