from image_match.signature_database_base import SignatureDatabaseBase
from image_match.signature_database_base import normalized_distance
from image_match.signature_database_base import make_record
from datetime import datetime
from itertools import product
from operator import itemgetter
import numpy as np


class SignatureES(SignatureDatabaseBase):
    """Elasticsearch driver for image-match with a single flat words field.

    All simple words, from 1 to N, are stored in a single string field of the
    document, named "simple_words", space separated, e.g.:

        {"simple_words": "11111 22222 33333 44444 55555 66666 77777"}

    The field is queried with a "match" on "simple_words" using
    "minimum_should_match". For instance, the document above is returned by
    ``search_single_record`` for the query words

        11111 99999 33333 44444 00000 55555 88888

    whenever minimum_should_match <= 4, because four words are in common.

    The order of the words in the string field is preserved on ingestion,
    although it has no effect on matching because of the way the field is
    queried.
    """

    def __init__(self, es, index='images', doc_type='image', timeout='10s',
                 size=100, minimum_should_match=6, *args, **kwargs):
        """Extra setup for Elasticsearch.

        Args:
            es (elasticsearch): an instance of the elasticsearch python driver
            index (Optional[string]): a name for the Elasticsearch index
                (default 'images')
            doc_type (Optional[string]): a name for the document type
                (default 'image')
            timeout (Optional[string]): how long to wait on an Elasticsearch
                query (default '10s')
            size (Optional[int]): maximum number of Elasticsearch results
                (default 100)
            minimum_should_match (Optional[int]): minimum number of words the
                queried image and a stored document must have in common for the
                document to be returned (default 6)
            *args (Optional): variable length argument list to pass to the base
                constructor
            **kwargs (Optional): arbitrary keyword arguments to pass to the
                base constructor

        Examples:
            >>> from elasticsearch import Elasticsearch
            >>> from image_match.elasticsearchflat_driver import SignatureES
            >>> es = Elasticsearch()
            >>> ses = SignatureES(es)
            >>> ses.add_image('https://example.com/mona_lisa.jpg')
            >>> ses.search_image('https://example.com/mona_lisa.jpg')
            [
              {'dist': 0.0,
               'id': u'AVM37nMg0osmmAxpPvx6',
               'path': u'https://example.com/mona_lisa.jpg',
               'score': 0.28797293}
            ]
        """
        self.es = es
        self.index = index
        self.doc_type = doc_type
        self.timeout = timeout
        self.size = size
        self.minimum_should_match = minimum_should_match

        super(SignatureES, self).__init__(*args, **kwargs)

    def search_single_record(self, rec, pre_filter=None):
        """Find documents matching one (already flattened) image record.

        Args:
            rec (dict): an image record built by ``make_record(..., flat=True)``
            pre_filter (Optional[dict]): Elasticsearch filter clause applied
                inside the bool query before matching (default None)

        Returns:
            list of dicts with 'id', 'score', 'metadata', 'path' and 'dist'
            keys, restricted to dist < self.distance_cutoff.
        """
        rec.pop('path')
        signature = rec.pop('signature')
        rec.pop('metadata', None)

        query = {
            'query': {
                'bool': {
                    'must': {
                        'match': {
                            'simple_words': {
                                'query': rec['simple_words'],
                                'minimum_should_match': str(self.minimum_should_match)
                            }
                        },
                    }
                }
            },
            # The signature is enough to rank hits; skip the bulky words field.
            '_source': {'excludes': ['simple_words']}
        }

        if pre_filter is not None:
            query['query']['bool']['filter'] = pre_filter

        # Perform minimum_should_match request.
        hits = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              body=query,
                              size=self.size,
                              timeout=self.timeout)['hits']['hits']

        sigs = np.array([hit['_source']['signature'] for hit in hits])
        if sigs.size == 0:
            return []

        dists = normalized_distance(sigs, np.array(signature))

        formatted = [{'id': hit['_id'],
                      'score': hit['_score'],
                      'metadata': hit['_source'].get('metadata'),
                      'path': hit['_source'].get('url', hit['_source'].get('path')),
                      'dist': dist}
                     for hit, dist in zip(hits, dists)]

        # Materialize a real list: the previous implementation returned a lazy
        # ``filter`` object, which is exhausted after one pass and breaks
        # len()/indexing for callers.
        return [row for row in formatted if row['dist'] < self.distance_cutoff]

    def insert_single_record(self, rec, refresh_after=False):
        """Index one record, stamping it with the insertion time."""
        rec['timestamp'] = datetime.now()
        self.es.index(index=self.index, doc_type=self.doc_type, body=rec,
                      refresh=refresh_after)

    def delete_duplicates(self, path):
        """Delete all but one entries in elasticsearch whose `path` value is
        equivalent to that of path.

        Args:
            path (string): path value to compare to those in the elastic search
        """
        matching_paths = [item['_id'] for item in
                          self.es.search(body={'query':
                                               {'match':
                                                {'path': path}
                                                }
                                               },
                                         index=self.index)['hits']['hits']
                          if item['_source']['path'] == path]
        # Keep the first hit, delete the rest.
        for id_tag in matching_paths[1:]:
            self.es.delete(index=self.index, doc_type=self.doc_type, id=id_tag)

    def add_image(self, path, img=None, bytestream=False, metadata=None,
                  refresh_after=False):
        """Add a single image to the database.

        Overrides the base implementation so the record is built with a flat
        'simple_words' field (``make_record(..., flat=True)``).

        Args:
            path (string): path or identifier for image. If img=None, then path
                is assumed to be a URL or filesystem path
            img (Optional[string]): usually raw image data. In this case, path
                will still be stored, but a signature will be generated from
                data in img. If bytestream is False, but img is not None, then
                img is assumed to be the URL or filesystem path. Thus, you can
                store image records with a different 'path' than the actual
                image location (default None)
            bytestream (Optional[boolean]): will the image be passed as raw
                bytes? That is, is the 'path_or_image' argument an in-memory
                image? If img is None, this argument is ignored (default False)
            metadata (Optional): any other information you want to include,
                can be nested (default None)
            refresh_after (Optional[boolean]): refresh the index immediately
                after indexing so the document is searchable (default False)
        """
        rec = make_record(path, self.gis, self.k, self.N, img=img,
                          bytestream=bytestream, metadata=metadata, flat=True)
        self.insert_single_record(rec, refresh_after=refresh_after)

    def search_image(self, path, all_orientations=False, bytestream=False,
                     pre_filter=None):
        """Search for matches.

        Overrides the base implementation so the query record is built with a
        flat 'simple_words' field (``make_record(..., flat=True)``).

        Args:
            path (string): path or image data. If bytestream=False, then path
                is assumed to be a URL or filesystem path. Otherwise, it's
                assumed to be raw image data
            all_orientations (Optional[boolean]): if True, search for all
                combinations of mirrorings, rotations, and color inversions
                (default False)
            bytestream (Optional[boolean]): will the image be passed as raw
                bytes? (default False)
            pre_filter (Optional[dict]): filters list before applying the
                matching algorithm (default None)

        Returns:
            a formatted list of dicts representing unique matches, sorted by
            'dist'.
        """
        img = self.gis.preprocess_image(path, bytestream)

        if all_orientations:
            # Build an iterator over composed transformations.
            inversions = [lambda x: x, lambda x: -x]
            mirrors = [lambda x: x, np.fliplr]
            rotations = [lambda x: x,
                         np.rot90,
                         lambda x: np.rot90(x, 2),
                         lambda x: np.rot90(x, 3)]
            # Cartesian product of all possible orientations.
            orientations = product(inversions, rotations, mirrors)
        else:
            # Otherwise just use the identity transformation.
            orientations = [lambda x: x]

        result = []
        # With all_orientations=False this loop runs exactly once.
        for transform in set(np.ravel(list(orientations))):
            transformed_img = transform(img)
            transformed_record = make_record(transformed_img, self.gis,
                                             self.k, self.N, flat=True)
            matches = self.search_single_record(transformed_record,
                                                pre_filter=pre_filter)
            result.extend(matches)

        # Deduplicate by document id, keeping the first occurrence.
        seen = set()
        unique = []
        for item in result:
            if item['id'] not in seen:
                unique.append(item)
                seen.add(item['id'])

        return sorted(unique, key=itemgetter('dist'))
from image_match.signature_database_base import SignatureDatabaseBase
from image_match.signature_database_base import normalized_distance
from image_match.signature_database_base import make_record
from datetime import datetime
from itertools import product
from operator import itemgetter
import numpy as np


class SignatureES(SignatureDatabaseBase):
    """Elasticsearch driver for image-match with a flat integer words field.

    All simple words, from 1 to N, are stored in a single 'simple_words' field
    as an array of integers. The field is queried with one 'term' clause per
    query word inside a bool query, combined with 'minimum_should_match'.
    A document is therefore returned when at least minimum_should_match of its
    stored words equal words of the queried image.
    """

    def __init__(self, es, index='images', doc_type='image', timeout='10s',
                 size=100, minimum_should_match=6, *args, **kwargs):
        """Extra setup for Elasticsearch.

        Args:
            es (elasticsearch): an instance of the elasticsearch python driver
            index (Optional[string]): a name for the Elasticsearch index
                (default 'images')
            doc_type (Optional[string]): a name for the document type
                (default 'image')
            timeout (Optional[string]): how long to wait on an Elasticsearch
                query (default '10s')
            size (Optional[int]): maximum number of Elasticsearch results
                (default 100)
            minimum_should_match (Optional[int]): minimum number of words the
                queried image and a stored document must have in common for the
                document to be returned (default 6)
            *args (Optional): variable length argument list to pass to the base
                constructor
            **kwargs (Optional): arbitrary keyword arguments to pass to the
                base constructor
        """
        self.es = es
        self.index = index
        self.doc_type = doc_type
        self.timeout = timeout
        self.size = size
        self.minimum_should_match = minimum_should_match

        super(SignatureES, self).__init__(*args, **kwargs)

    def search_single_record(self, rec, pre_filter=None):
        """Find documents matching one (already flattened) image record.

        Args:
            rec (dict): an image record built by
                ``make_record(..., flat=True, flatint=True)``
            pre_filter (Optional[dict]): Elasticsearch filter clause applied
                inside the bool query before matching (default None)

        Returns:
            list of dicts with 'id', 'score', 'metadata', 'path' and 'dist'
            keys, restricted to dist < self.distance_cutoff.
        """
        rec.pop('path')
        signature = rec.pop('signature')
        rec.pop('metadata', None)

        query = {
            'query': {
                'bool': {
                    # One term clause per integer word of the queried image.
                    'should': [{'term': {'simple_words': word}}
                               for word in rec['simple_words']],
                    'minimum_should_match': str(self.minimum_should_match)
                }
            },
            # The signature is enough to rank hits; skip the bulky words field.
            '_source': {'excludes': ['simple_words']}
        }

        if pre_filter is not None:
            query['query']['bool']['filter'] = pre_filter

        # Perform minimum_should_match request.
        hits = self.es.search(index=self.index,
                              doc_type=self.doc_type,
                              body=query,
                              size=self.size,
                              timeout=self.timeout)['hits']['hits']

        sigs = np.array([hit['_source']['signature'] for hit in hits])
        if sigs.size == 0:
            return []

        dists = normalized_distance(sigs, np.array(signature))

        formatted = [{'id': hit['_id'],
                      'score': hit['_score'],
                      'metadata': hit['_source'].get('metadata'),
                      'path': hit['_source'].get('url', hit['_source'].get('path')),
                      'dist': dist}
                     for hit, dist in zip(hits, dists)]

        # Materialize a real list: the previous implementation returned a lazy
        # ``filter`` object, which is exhausted after one pass and breaks
        # len()/indexing for callers.
        return [row for row in formatted if row['dist'] < self.distance_cutoff]

    def insert_single_record(self, rec, refresh_after=False):
        """Index one record, stamping it with the insertion time."""
        rec['timestamp'] = datetime.now()
        self.es.index(index=self.index, doc_type=self.doc_type, body=rec,
                      refresh=refresh_after)

    def delete_duplicates(self, path):
        """Delete all but one entries in elasticsearch whose `path` value is
        equivalent to that of path.

        Args:
            path (string): path value to compare to those in the elastic search
        """
        matching_paths = [item['_id'] for item in
                          self.es.search(body={'query':
                                               {'match':
                                                {'path': path}
                                                }
                                               },
                                         index=self.index)['hits']['hits']
                          if item['_source']['path'] == path]
        # Keep the first hit, delete the rest.
        for id_tag in matching_paths[1:]:
            self.es.delete(index=self.index, doc_type=self.doc_type, id=id_tag)

    def add_image(self, path, img=None, bytestream=False, metadata=None,
                  refresh_after=False):
        """Add a single image to the database.

        Overrides the base implementation so the record is built with a flat
        integer 'simple_words' field
        (``make_record(..., flat=True, flatint=True)``).

        Args:
            path (string): path or identifier for image. If img=None, then path
                is assumed to be a URL or filesystem path
            img (Optional[string]): usually raw image data. In this case, path
                will still be stored, but a signature will be generated from
                data in img. If bytestream is False, but img is not None, then
                img is assumed to be the URL or filesystem path (default None)
            bytestream (Optional[boolean]): will the image be passed as raw
                bytes? If img is None, this argument is ignored (default False)
            metadata (Optional): any other information you want to include,
                can be nested (default None)
            refresh_after (Optional[boolean]): refresh the index immediately
                after indexing so the document is searchable (default False)
        """
        rec = make_record(path, self.gis, self.k, self.N, img=img,
                          bytestream=bytestream, metadata=metadata,
                          flat=True, flatint=True)
        self.insert_single_record(rec, refresh_after=refresh_after)

    def search_image(self, path, all_orientations=False, bytestream=False,
                     pre_filter=None):
        """Search for matches.

        Overrides the base implementation so the query record is built with a
        flat integer 'simple_words' field
        (``make_record(..., flat=True, flatint=True)``).

        Args:
            path (string): path or image data. If bytestream=False, then path
                is assumed to be a URL or filesystem path. Otherwise, it's
                assumed to be raw image data
            all_orientations (Optional[boolean]): if True, search for all
                combinations of mirrorings, rotations, and color inversions
                (default False)
            bytestream (Optional[boolean]): will the image be passed as raw
                bytes? (default False)
            pre_filter (Optional[dict]): filters list before applying the
                matching algorithm (default None)

        Returns:
            a formatted list of dicts representing unique matches, sorted by
            'dist'.
        """
        img = self.gis.preprocess_image(path, bytestream)

        if all_orientations:
            # Build an iterator over composed transformations.
            inversions = [lambda x: x, lambda x: -x]
            mirrors = [lambda x: x, np.fliplr]
            rotations = [lambda x: x,
                         np.rot90,
                         lambda x: np.rot90(x, 2),
                         lambda x: np.rot90(x, 3)]
            # Cartesian product of all possible orientations.
            orientations = product(inversions, rotations, mirrors)
        else:
            # Otherwise just use the identity transformation.
            orientations = [lambda x: x]

        result = []
        # With all_orientations=False this loop runs exactly once.
        for transform in set(np.ravel(list(orientations))):
            transformed_img = transform(img)
            transformed_record = make_record(transformed_img, self.gis,
                                             self.k, self.N,
                                             flat=True, flatint=True)
            matches = self.search_single_record(transformed_record,
                                                pre_filter=pre_filter)
            result.extend(matches)

        # Deduplicate by document id, keeping the first occurrence.
        seen = set()
        unique = []
        for item in result:
            if item['id'] not in seen:
                unique.append(item)
                seen.add(item['id'])

        return sorted(unique, key=itemgetter('dist'))
- For example: + For example, when flat is set to False (default): {'path': 'https://pixabay.com/static/uploads/photo/2012/11/28/08/56/mona-lisa-67506_960_720.jpg', 'signature': [0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0 ... ] @@ -339,6 +344,24 @@ def make_record(path, gis, k, N, img=None, bytestream=False, metadata=None): 'metadata': {...} } + Or when flat is set to True: + + {'path': 'https://pixabay.com/static/uploads/photo/2012/11/28/08/56/mona-lisa-67506_960_720.jpg', + 'signature': [0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0 ... ] + 'simple_words': "42252475 23885671 9967839 4257902 28651959 33773597 39331441 39327300 11337345 9571961 + 28697868 14834907 7434746 37985525 10753207 9566120 ..." + 'metadata': {...} + } + + Or when flat is set to True with flatint also set to True: + + {'path': 'https://pixabay.com/static/uploads/photo/2012/11/28/08/56/mona-lisa-67506_960_720.jpg', + 'signature': [0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0 ... ] + 'simple_words': [42252475, 23885671, 9967839, 4257902, 28651959, 33773597, 39331441, 39327300, 11337345, + 9571961, 28697868, 14834907, 7434746, 37985525, 10753207, 9566120, ...] 
+ 'metadata': {...} + } + """ record = dict() record['path'] = path @@ -357,8 +380,15 @@ def make_record(path, gis, k, N, img=None, bytestream=False, metadata=None): words = words_to_int(words) - for i in range(N): - record[''.join(['simple_word_', str(i)])] = words[i].tolist() + if flat: + if flatint: + record['simple_words'] = words.tolist() + else: + for i in range(N): + record['simple_words'] = " ".join(map(str, words.tolist())) + else: + for i in range(N): + record[''.join(['simple_word_', str(i)])] = words[i].tolist() return record diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6833249 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +elasticsearch==5.4.0 +matplotlib +numpy==1.13.1 +Pillow==4.2.1 +argparse \ No newline at end of file diff --git a/tests/elasticsearch_helper.py b/tests/elasticsearch_helper.py new file mode 100644 index 0000000..cf991c5 --- /dev/null +++ b/tests/elasticsearch_helper.py @@ -0,0 +1,230 @@ +import pytest +import urllib.request +import os +import hashlib +import unittest +from elasticsearch import Elasticsearch, ConnectionError, RequestError, NotFoundError +from time import sleep + +from image_match.elasticsearch_driver import SignatureES +from PIL import Image + +test_img_url1 = 'https://camo.githubusercontent.com/810bdde0a88bc3f8ce70c5d85d8537c37f707abe/68747470733a2f2f75706c6f61642e77696b696d656469612e6f72672f77696b6970656469612f636f6d6d6f6e732f7468756d622f652f65632f4d6f6e615f4c6973612c5f62795f4c656f6e6172646f5f64615f56696e63692c5f66726f6d5f4332524d465f7265746f75636865642e6a70672f36383770782d4d6f6e615f4c6973612c5f62795f4c656f6e6172646f5f64615f56696e63692c5f66726f6d5f4332524d465f7265746f75636865642e6a7067' +test_img_url2 = 'https://camo.githubusercontent.com/826e23bc3eca041110a5af467671b012606aa406/68747470733a2f2f63322e737461746963666c69636b722e636f6d2f382f373135382f363831343434343939315f303864383264653537655f7a2e6a7067' +urllib.request.urlretrieve(test_img_url1, 'test1.jpg') 
urllib.request.urlretrieve(test_img_url2, 'test2.jpg')

# Random suffix so concurrent test runs don't collide on the same index.
INDEX_NAME = 'test_environment_{}'.format(hashlib.md5(os.urandom(128)).hexdigest()[:12])
DOC_TYPE = 'image'
MAPPINGS = {
    "mappings": {
        DOC_TYPE: {
            "dynamic": True,
            "properties": {
                "metadata": {
                    "type": "object",
                    "dynamic": True,
                    "properties": {
                        "tenant_id": {"type": "keyword"}
                    }
                }
            }
        }
    }
}


class BaseTestsParent:
    """Wrapper namespace so pytest does not collect BaseTests directly;
    driver-specific suites subclass BaseTestsParent.BaseTests."""

    class BaseTests(unittest.TestCase):

        @property
        def es(self):
            # A fresh client per access; connection setup is cheap enough for tests.
            return Elasticsearch()

        @property
        def ses(self):
            return SignatureES(es=self.es, index=INDEX_NAME, doc_type=DOC_TYPE)

        @pytest.fixture(scope='function', autouse=True)
        def setup_index(self, request, index_name):
            """Guarantee each test starts with a fresh index built from MAPPINGS."""
            try:
                self.es.indices.create(index=index_name, body=MAPPINGS)
            except RequestError as e:
                if e.error == u'index_already_exists_exception':
                    # Fix: the original only deleted the stale index and never
                    # recreated it, so the test then ran against a dynamically
                    # created index without MAPPINGS.
                    self.es.indices.delete(index_name)
                    self.es.indices.create(index=index_name, body=MAPPINGS)
                else:
                    raise

            def fin():
                try:
                    self.es.indices.delete(index_name)
                except NotFoundError:
                    pass

            request.addfinalizer(fin)

        @pytest.fixture(scope='class')
        def index_name(self):
            return INDEX_NAME

        @pytest.fixture(scope='function', autouse=True)
        def cleanup_index(self, request, index_name):
            def fin():
                try:
                    self.es.indices.delete(index_name)
                except NotFoundError:
                    pass

            request.addfinalizer(fin)

        def test_elasticsearch_running(self):
            # Retry a few times: the service may still be starting up.
            i = 0
            while i < 5:
                try:
                    self.es.ping()
                    assert True
                    return
                except ConnectionError:
                    i += 1
                    sleep(2)

            pytest.fail('Elasticsearch not running (failed to connect after {} tries)'
                        .format(str(i)))

        def test_add_image_by_url(self):
            self.ses.add_image(test_img_url1)
            self.ses.add_image(test_img_url2)
            assert True

        def test_add_image_by_path(self):
            self.ses.add_image('test1.jpg')
            assert True

        def test_index_refresh(self):
            self.ses.add_image('test1.jpg', refresh_after=True)
            r = self.ses.search_image('test1.jpg')
            assert len(r) == 1

        def test_add_image_as_bytestream(self):
            with open('test1.jpg', 'rb') as f:
                self.ses.add_image('bytestream_test', img=f.read(), bytestream=True)
            assert True

        def test_add_image_with_different_name(self):
            self.ses.add_image('custom_name_test', img='test1.jpg', bytestream=False)
            assert True

        def test_lookup_from_url(self):
            self.ses.add_image('test1.jpg', refresh_after=True)
            r = self.ses.search_image(test_img_url1)
            assert len(r) == 1
            assert r[0]['path'] == 'test1.jpg'
            assert 'score' in r[0]
            assert 'dist' in r[0]
            assert 'id' in r[0]

        def test_lookup_from_file(self):
            self.ses.add_image('test1.jpg', refresh_after=True)
            r = self.ses.search_image('test1.jpg')
            assert len(r) == 1
            assert r[0]['path'] == 'test1.jpg'
            assert 'score' in r[0]
            assert 'dist' in r[0]
            assert 'id' in r[0]

        def test_lookup_from_bytestream(self):
            self.ses.add_image('test1.jpg', refresh_after=True)
            with open('test1.jpg', 'rb') as f:
                r = self.ses.search_image(f.read(), bytestream=True)
            assert len(r) == 1
            assert r[0]['path'] == 'test1.jpg'
            assert 'score' in r[0]
            assert 'dist' in r[0]
            assert 'id' in r[0]

        def test_lookup_with_cutoff(self):
            self.ses.add_image('test2.jpg', refresh_after=True)
            ses = self.ses
            ses.distance_cutoff = 0.01
            r = ses.search_image('test1.jpg')
            assert len(r) == 0

        # NOTE(review): intentionally not named test_* (disabled), same as
        # upstream; the expected distance is environment-sensitive.
        def check_distance_consistency(self):
            self.ses.add_image('test1.jpg')
            self.ses.add_image('test2.jpg', refresh_after=True)
            r = self.ses.search_image('test1.jpg')
            assert r[0]['dist'] == 0.0
            assert r[-1]['dist'] == 0.42672771706789686

        def test_add_image_with_metadata(self):
            metadata = {'some_info':
                        {'test':
                         'ok!'
                         }
                        }
            self.ses.add_image('test1.jpg', metadata=metadata, refresh_after=True)
            r = self.ses.search_image('test1.jpg')
            assert r[0]['metadata'] == metadata
            assert 'path' in r[0]
            assert 'score' in r[0]
            assert 'dist' in r[0]
            assert 'id' in r[0]

        def test_lookup_with_filter_by_metadata(self):
            metadata = dict(
                tenant_id='foo'
            )
            self.ses.add_image('test1.jpg', metadata=metadata, refresh_after=True)

            metadata2 = dict(
                tenant_id='bar-2'
            )
            self.ses.add_image('test2.jpg', metadata=metadata2, refresh_after=True)

            r = self.ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "foo"}})
            assert len(r) == 1
            assert r[0]['metadata'] == metadata

            r = self.ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-2"}})
            assert len(r) == 1
            assert r[0]['metadata'] == metadata2

            r = self.ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-3"}})
            assert len(r) == 0

        def test_all_orientations(self):
            im = Image.open('test1.jpg')
            im.rotate(90, expand=True).save('rotated_test1.jpg')

            self.ses.add_image('test1.jpg', refresh_after=True)
            r = self.ses.search_image('rotated_test1.jpg', all_orientations=True)
            assert len(r) == 1
            assert r[0]['path'] == 'test1.jpg'
            assert r[0]['dist'] < 0.05  # some error from rotation

            with open('rotated_test1.jpg', 'rb') as f:
                r = self.ses.search_image(f.read(), bytestream=True, all_orientations=True)
                assert len(r) == 1
                assert r[0]['dist'] < 0.05  # some error from rotation

        def test_duplicate(self):
            self.ses.add_image('test1.jpg', refresh_after=True)
            self.ses.add_image('test1.jpg', refresh_after=True)
            r = self.ses.search_image('test1.jpg')
            assert len(r) == 2
            assert r[0]['path'] == 'test1.jpg'
            assert 'score' in r[0]
            assert 'dist' in r[0]
            assert 'id' in r[0]

        def test_duplicate_removal(self):
            for i in range(10):
                self.ses.add_image('test1.jpg')
            sleep(1)
            r = self.ses.search_image('test1.jpg')
            assert len(r) == 10
            self.ses.delete_duplicates('test1.jpg')
            sleep(1)
            r = self.ses.search_image('test1.jpg')
            assert len(r) == 1
'https://camo.githubusercontent.com/810bdde0a88bc3f8ce70c5d85d8537c37f707abe/68747470733a2f2f75706c6f61642e77696b696d656469612e6f72672f77696b6970656469612f636f6d6d6f6e732f7468756d622f652f65632f4d6f6e615f4c6973612c5f62795f4c656f6e6172646f5f64615f56696e63692c5f66726f6d5f4332524d465f7265746f75636865642e6a70672f36383770782d4d6f6e615f4c6973612c5f62795f4c656f6e6172646f5f64615f56696e63692c5f66726f6d5f4332524d465f7265746f75636865642e6a7067' -test_img_url2 = 'https://camo.githubusercontent.com/826e23bc3eca041110a5af467671b012606aa406/68747470733a2f2f63322e737461746963666c69636b722e636f6d2f382f373135382f363831343434343939315f303864383264653537655f7a2e6a7067' -urllib.request.urlretrieve(test_img_url1, 'test1.jpg') -urllib.request.urlretrieve(test_img_url2, 'test2.jpg') - -INDEX_NAME = 'test_environment_{}'.format(hashlib.md5(os.urandom(128)).hexdigest()[:12]) -DOC_TYPE = 'image' -MAPPINGS = { - "mappings": { - DOC_TYPE: { - "dynamic": True, - "properties": { - "metadata": { - "type": "object", - "dynamic": True, - "properties": { - "tenant_id": { "type": "keyword" } - } - } - } - } - } -} - - -@pytest.fixture(scope='module', autouse=True) -def index_name(): - return INDEX_NAME - -@pytest.fixture(scope='function', autouse=True) -def setup_index(request, index_name): - es = Elasticsearch() - try: - es.indices.create(index=index_name, body=MAPPINGS) - except RequestError as e: - if e.error == u'index_already_exists_exception': - es.indices.delete(index_name) - else: - raise - - def fin(): - try: - es.indices.delete(index_name) - except NotFoundError: - pass - - request.addfinalizer(fin) - -@pytest.fixture(scope='function', autouse=True) -def cleanup_index(request, es, index_name): - def fin(): - try: - es.indices.delete(index_name) - except NotFoundError: - pass - request.addfinalizer(fin) - -@pytest.fixture -def es(): - return Elasticsearch() - -@pytest.fixture -def ses(es, index_name): - return SignatureES(es=es, index=index_name, doc_type=DOC_TYPE) - -def 
test_elasticsearch_running(es): - i = 0 - while i < 5: - try: - es.ping() - assert True - return - except ConnectionError: - i += 1 - sleep(2) - - pytest.fail('Elasticsearch not running (failed to connect after {} tries)' - .format(str(i))) - - -def test_add_image_by_url(ses): - ses.add_image(test_img_url1) - ses.add_image(test_img_url2) - assert True - - -def test_add_image_by_path(ses): - ses.add_image('test1.jpg') - assert True - - -def test_index_refresh(ses): - ses.add_image('test1.jpg', refresh_after=True) - r = ses.search_image('test1.jpg') - assert len(r) == 1 - - -def test_add_image_as_bytestream(ses): - with open('test1.jpg', 'rb') as f: - ses.add_image('bytestream_test', img=f.read(), bytestream=True) - assert True - - -def test_add_image_with_different_name(ses): - ses.add_image('custom_name_test', img='test1.jpg', bytestream=False) - assert True - - -def test_lookup_from_url(ses): - ses.add_image('test1.jpg', refresh_after=True) - r = ses.search_image(test_img_url1) - assert len(r) == 1 - assert r[0]['path'] == 'test1.jpg' - assert 'score' in r[0] - assert 'dist' in r[0] - assert 'id' in r[0] - - -def test_lookup_from_file(ses): - ses.add_image('test1.jpg', refresh_after=True) - r = ses.search_image('test1.jpg') - assert len(r) == 1 - assert r[0]['path'] == 'test1.jpg' - assert 'score' in r[0] - assert 'dist' in r[0] - assert 'id' in r[0] - -def test_lookup_from_bytestream(ses): - ses.add_image('test1.jpg', refresh_after=True) - with open('test1.jpg', 'rb') as f: - r = ses.search_image(f.read(), bytestream=True) - assert len(r) == 1 - assert r[0]['path'] == 'test1.jpg' - assert 'score' in r[0] - assert 'dist' in r[0] - assert 'id' in r[0] - -def test_lookup_with_cutoff(ses): - ses.add_image('test2.jpg', refresh_after=True) - ses.distance_cutoff=0.01 - r = ses.search_image('test1.jpg') - assert len(r) == 0 - - -def check_distance_consistency(ses): - ses.add_image('test1.jpg') - ses.add_image('test2.jpg', refresh_after=True) - r = 
ses.search_image('test1.jpg') - assert r[0]['dist'] == 0.0 - assert r[-1]['dist'] == 0.42672771706789686 - - -def test_add_image_with_metadata(ses): - metadata = {'some_info': - {'test': - 'ok!' - } - } - ses.add_image('test1.jpg', metadata=metadata, refresh_after=True) - r = ses.search_image('test1.jpg') - assert r[0]['metadata'] == metadata - assert 'path' in r[0] - assert 'score' in r[0] - assert 'dist' in r[0] - assert 'id' in r[0] - - -def test_lookup_with_filter_by_metadata(ses): - metadata = dict( - tenant_id='foo' - ) - ses.add_image('test1.jpg', metadata=metadata, refresh_after=True) - - metadata2 = dict( - tenant_id='bar-2' - ) - ses.add_image('test2.jpg', metadata=metadata2, refresh_after=True) - - r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "foo"}}) - assert len(r) == 1 - assert r[0]['metadata'] == metadata - - r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-2"}}) - assert len(r) == 1 - assert r[0]['metadata'] == metadata2 - - r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-3"}}) - assert len(r) == 0 - - -def test_all_orientations(ses): - im = Image.open('test1.jpg') - im.rotate(90, expand=True).save('rotated_test1.jpg') - - ses.add_image('test1.jpg', refresh_after=True) - r = ses.search_image('rotated_test1.jpg', all_orientations=True) - assert len(r) == 1 - assert r[0]['path'] == 'test1.jpg' - assert r[0]['dist'] < 0.05 # some error from rotation - - with open('rotated_test1.jpg', 'rb') as f: - r = ses.search_image(f.read(), bytestream=True, all_orientations=True) - assert len(r) == 1 - assert r[0]['dist'] < 0.05 # some error from rotation - - -def test_duplicate(ses): - ses.add_image('test1.jpg', refresh_after=True) - ses.add_image('test1.jpg', refresh_after=True) - r = ses.search_image('test1.jpg') - assert len(r) == 2 - assert r[0]['path'] == 'test1.jpg' - assert 'score' in r[0] - assert 'dist' in r[0] - assert 'id' in r[0] - - -def 
test_duplicate_removal(ses): - for i in range(10): - ses.add_image('test1.jpg') - sleep(1) - r = ses.search_image('test1.jpg') - assert len(r) == 10 - ses.delete_duplicates('test1.jpg') - sleep(1) - r = ses.search_image('test1.jpg') - assert len(r) == 1 +class ElasticSearchFlatTestSuite(BaseTestsParent.BaseTests): + pass diff --git a/tests/test_elasticsearch_driver_speed.py b/tests/test_elasticsearch_driver_speed.py new file mode 100644 index 0000000..2135fad --- /dev/null +++ b/tests/test_elasticsearch_driver_speed.py @@ -0,0 +1,454 @@ +import os +from elasticsearch import Elasticsearch +import time +from numpy import random +import numpy as np +from PIL import ImageFilter, Image +import matplotlib.pyplot as plt + +from image_match.elasticsearch_driver \ + import SignatureES as SignatureES_fields +from image_match.elasticsearchflat_driver \ + import SignatureES as SignatureES_flat +from image_match.elasticsearchflatint_driver \ + import SignatureES as SignatureES_flatint + +# To run this test, have an elasticsearch on ports 9200 and 9300 +# docker run -d -p 9200:9200 -p 9300:9300 elasticsearch:5.5.2 + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("--delete-indices", default=False, + help="Will delete existing ES indices (test_environment_" + "fields, test_environment_int and test_environment_" + "flatint") +parser.add_argument("--populate-indices", default=False, + help="Ingest into indices all the images from dataset") +parser.add_argument("--max-msm", default=6, + help="Until which minimum should match (msm) value to run " + "this benchmark") +parser.add_argument("--num-random", default=500, + help="Total number of images to search for a given msm. 
" + "Total num searches = num_random * len(range_msm)") +args = parser.parse_args() + + +# Params +delete_indices = args.delete_indices +populate_indices = args.populate_indices +max_msm = args.max_msm +num_random = args.num_random +range_msm = range(1, max_msm + 1) + + +def noise_generator(noise_type, image): + """ + Found on https://stackoverflow.com/questions/22937589/ + how-to-add-noise-gaussian-salt-and-pepper-etc-to-image-in-python-with-opencv + Generate noise to a given Image based on required noise type + + Input parameters: + image: ndarray (input image data. It will be converted to float) + noise_type: string + 'gauss' Gaussian-distribution based noise + 's&p' Salt and Pepper noise, 0 or 1 + """ + if noise_type == "gauss": + row, col, ch = image.shape + mean = 0.5 + var = 0.01 + sigma = var ** 0.5 + gauss = np.random.normal(mean, sigma, (row, col, ch)) + gauss = gauss.reshape(row, col, ch) + noisy = image + gauss + return noisy.astype('uint8') + elif noise_type == "s&p": + s_vs_p = 0.5 + amount = 0.01 + out = image + # Generate Salt '1' noise + num_salt = np.ceil(amount * image.size * s_vs_p) + coords = [np.random.randint(0, idx - 1, int(num_salt)) + for idx in image.shape] + out[coords] = 255 + # Generate Pepper '0' noise + num_pepper = np.ceil(amount * image.size * (1. 
- s_vs_p)) + coords = [np.random.randint(0, idx - 1, int(num_pepper)) + for idx in image.shape] + out[coords] = 0 + return out + else: + return image + + +# ES for fields +INDEX_NAME_FIELDS = 'test_environment_fields' +DOC_TYPE_FIELDS = 'image' +MAPPINGS_FIELDS = { + "mappings": { + DOC_TYPE_FIELDS: { + "dynamic": True, + "properties": { + "metadata": { + "type": "nested", + "dynamic": True, + "properties": { + "tenant_id": {"type": "keyword"}, + "project_id": {"type": "keyword"} + } + } + } + } + } +} + +# ES for flat +INDEX_NAME_FLAT = 'test_environment_flat' +DOC_TYPE_FLAT = 'image_flat' +MAPPINGS_FLAT = { + "mappings": { + DOC_TYPE_FLAT: { + "dynamic": True, + "properties": { + "metadata": { + "type": "nested", + "dynamic": True, + "properties": { + "tenant_id": {"type": "keyword"}, + "project_id": {"type": "keyword"} + } + } + } + } + } +} + +# ES for flatint +INDEX_NAME_FLATINT = 'test_environment_flatint' +DOC_TYPE_FLATINT = 'image_flatint' +MAPPINGS_FLATINT = { + "mappings": { + DOC_TYPE_FLATINT: { + "dynamic": True, + "properties": { + "metadata": { + "type": "nested", + "dynamic": True, + "properties": { + "tenant_id": {"type": "keyword"}, + "project_id": {"type": "keyword"} + } + }, + "simple_words": { + "type": "long", + "doc_values": False, + "store": False + } + } + } + } +} + +es = Elasticsearch() + +if delete_indices: + print("Delete indices") + es.indices.delete(INDEX_NAME_FIELDS) + es.indices.delete(INDEX_NAME_FLAT) + es.indices.delete(INDEX_NAME_FLATINT) + + print("Create indices") + es.indices.create(index=INDEX_NAME_FIELDS, body=MAPPINGS_FIELDS) + es.indices.create(index=INDEX_NAME_FLAT, body=MAPPINGS_FLAT) + es.indices.create(index=INDEX_NAME_FLATINT, body=MAPPINGS_FLATINT) + +# Define three ses +print("Created index {} for fields documents".format(INDEX_NAME_FIELDS)) +print("Created index {} for flat documents".format(INDEX_NAME_FLAT)) +print("Created index {} for flatint documents".format(INDEX_NAME_FLATINT)) + +# The relatively small size 
of returned document (100, which is default) +# can lead to hard to inconsistent results (typically +# documents only found in flat but not in fields) because the correct document +# might not be in the top 100 for a fields search, but in the top 100 for a +# flat or flatint search. +ses_fields = SignatureES_fields(es=es, index=INDEX_NAME_FIELDS, + doc_type=DOC_TYPE_FIELDS, size=100) +ses_flat = SignatureES_flat(es=es, index=INDEX_NAME_FLAT, + doc_type=DOC_TYPE_FLAT, size=100) +ses_flatint = SignatureES_flatint(es=es, index=INDEX_NAME_FLATINT, + doc_type=DOC_TYPE_FLATINT, size=100) + +# Download dataset +print("Download dataset if does not exist") +dataset_url = "http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz" +dir_path = os.path.dirname(os.path.realpath(__file__)) +local_file = os.path.join(dir_path, "101_ObjectCategories.tar.gz") +local_directory = os.path.join(dir_path, "101_ObjectCategories") +if not os.path.exists("101_ObjectCategories"): + cmd = "wget {} -O {}".format(dataset_url, local_file) + print(cmd) + os.system(cmd) + + cmd = "tar xzf {}".format(local_file) + print(cmd) + os.system(cmd) + +# Populate the three indexes with images +print("Ingest documents") +all_files = [] +total_time_ingest_fields = 0 +total_time_ingest_flat = 0 +total_time_ingest_flatint = 0 +for root, dirs, files in os.walk(local_directory): + for file in files: + full_path = os.path.join(root, file) + all_files.append(full_path) + + if len(all_files) % 1000 == 0: + print("{} documents ingested (in each index)" + .format(len(all_files))) + + if populate_indices: + t_fields = time.time() + ses_fields.add_image(full_path) + total_time_ingest_fields += (time.time() - t_fields) + + t_flat = time.time() + ses_flat.add_image(full_path) + total_time_ingest_flat += (time.time() - t_flat) + + t_flatint = time.time() + ses_flatint.add_image(full_path) + total_time_ingest_flatint += (time.time() - t_flatint) + +print("{} to ingest fields 
documents".format(total_time_ingest_fields)) +print("{} to ingest flats documents".format(total_time_ingest_flat)) +print("{} to ingest flatint documents".format(total_time_ingest_flatint)) + +# Pick 500 random files and request both indexes +total_time_search_fields = 0 +total_time_search_flat = 0 +total_time_search_flatint = 0 + +random_images = random.choice(all_files, num_random).tolist() + +# Store all stats per msm {"1": {"same_first_flat": 489, +# "not_same_first_flat": [0, 0, 0], "same_first_flatint": 3, +# "not_same_first_flatint": [0, 0, 0]}} +stats_msm = {} + +for msm in range_msm: + ses_flat.minimum_should_match = msm + ses_flatint.minimum_should_match = msm + + same_first_flat = 0 # Number of time the first result is the same + not_same_first_flat = [0, 0, 0] # both not found, found in fields, in flat + same_first_flatint = 0 # Number of time the first result is the same + not_same_first_flatint = [0, 0, 0] # idem + + for image_path in random_images: + original_image = Image.open(image_path) + altered_path = "altered.jpg" + # altered_image = original_image.filter(ImageFilter.BLUR) + img_array_with_noise = noise_generator("s&p", np.array(original_image)) + altered_image = Image.fromarray(img_array_with_noise) + altered_image.save(altered_path) + image_path_to_search = altered_path + + t_search_fields = time.time() + res_fields = ses_fields.search_image(image_path_to_search) + total_time_search_fields += (time.time() - t_search_fields) + + t_search_flat = time.time() + res_flat = ses_flat.search_image(image_path_to_search) + total_time_search_flat += (time.time() - t_search_flat) + + t_search_flatint = time.time() + res_flatint = ses_flatint.search_image(image_path_to_search) + total_time_search_flatint += (time.time() - t_search_flatint) + + # FLAT analysis + # Precision of first result + same_first_flat_bool = False + if len(res_fields) > 0 and len(res_flat) > 0: + if res_fields[0]["path"] == res_flat[0]["path"]: + same_first_flat_bool = True + elif 
len(res_fields) == 0 and len(res_flat) == 0: + same_first_flat_bool = True # both fields and flat didn't find + + # When the first result is not the same, find out more details + if same_first_flat_bool: + same_first_flat += 1 + else: + pathes_fields = [res["path"] for res in res_fields] + [""] + pathes_flat = [res["path"] for res in res_flat] + [""] + if image_path not in pathes_fields and image_path not in pathes_flat: + not_same_first_flat[0] += 1 + elif image_path not in pathes_fields and pathes_flat[0] == image_path: + not_same_first_flat[2] += 1 + elif image_path not in pathes_flat and pathes_fields[0] == image_path: + not_same_first_flat[1] += 1 + + # FLATINT analysis + # Precision of first result + same_first_flatint_bool = False + if len(res_fields) > 0 and len(res_flatint) > 0: + if res_fields[0]["path"] == res_flatint[0]["path"]: + same_first_flatint_bool = True + elif len(res_fields) == 0 and len(res_flatint) == 0: + same_first_flatint_bool = True # both fields and flatint didn't find + + # When the first result is not the same, find out more details + if same_first_flatint_bool: + same_first_flatint += 1 + else: + pathes_fields = [res["path"] for res in res_fields] + [""] + pathes_flatint = [res["path"] for res in res_flatint] + [""] + if image_path not in pathes_fields and image_path not in pathes_flatint: + not_same_first_flatint[0] += 1 + elif image_path not in pathes_fields and pathes_flatint[0] == image_path: + not_same_first_flatint[2] += 1 + elif image_path not in pathes_flatint and pathes_fields[0] == image_path: + not_same_first_flatint[1] += 1 + + # Delete blurred image + os.remove(altered_path) + + stats_msm[str(msm)] = { + "same_first_flat": same_first_flat, + "not_same_first_flat": not_same_first_flat, + "same_first_flatint": same_first_flatint, + "not_same_first_flatint": not_same_first_flatint + } + + print("") + print("minimum_should_match = {}".format(msm)) + + print("--flat--") + print("{} same first results (out of 
{})".format(same_first_flat, num_random)) + print("When not same first results ({} cases)".format(sum(not_same_first_flat))) + print(". {} both wrong".format(not_same_first_flat[0])) + print(". {} found in fields but not in flat".format(not_same_first_flat[1])) + print(". {} found in flat but not in fields".format(not_same_first_flat[2])) + + print("--flatint--") + print("{} same first results (out of {})".format(same_first_flatint, num_random)) + print("When not same first results ({} cases)".format(sum(not_same_first_flatint))) + print(". {} both wrong".format(not_same_first_flatint[0])) + print(". {} found in fields but not in flatint".format(not_same_first_flatint[1])) + print(". {} found in flatint but not in fields".format(not_same_first_flatint[2])) + + +print(stats_msm) + +print("") +num_searches = num_random * max_msm +print("{} searches total".format(num_searches)) +print("{} to search fields documents".format(total_time_search_fields)) +print("{} to search flat documents".format(total_time_search_flat)) +print("{} to search flatint documents".format(total_time_search_flatint)) + +# ----------------------------------------------------------------------------- +# +# Draw plots in png files for further analysis +# +# ----------------------------------------------------------------------------- +# Typical stats_msm: +# {'1': {'same_first_flat': 499, 'not_same_first_flat': [0, 1, 0], +# 'same_first_flatint': 499, 'not_same_first_flatint': [0, 0, 0]}, +# '2': {'same_first_flat': 497, 'not_same_first_flat': [2, 1, 0], +# 'same_first_flatint': 494, 'not_same_first_flatint': [4, 1, 0]}, +# '3': {'same_first_flat': 496, 'not_same_first_flat': [1, 2, 0], +# 'same_first_flatint': 495, 'not_same_first_flatint': [1, 2, 0]}, +# '4': {'same_first_flat': 494, 'not_same_first_flat': [2, 2, 0], +# 'same_first_flatint': 495, 'not_same_first_flatint': [2, 2, 0]}, +# '5': {'same_first_flat': 488, 'not_same_first_flat': [6, 4, 0], +# 'same_first_flatint': 489, 
'not_same_first_flatint': [6, 4, 0]}, +# '6': {'same_first_flat': 490, 'not_same_first_flat': [4, 5, 0], +# 'same_first_flatint': 489, 'not_same_first_flatint': [4, 5, 0]}} + +# Generate stat plot for ingestion +names = ["fields", "flat_txt", "flat_int"] +values_ingest = [total_time_ingest_fields, total_time_ingest_flat, total_time_ingest_flatint] +colors = ["red", "green", "blue"] +plt.xlabel("Ingestion Time (ms)") +plt.title("Average Ingestion Time ({} documents)".format(len(all_files))) +values_ingest_mean = [v / float(len(all_files)) for v in values_ingest] +plt.barh(names, values_ingest_mean, color=colors) +for i, v in enumerate(values_ingest_mean): + plt.text(v - .01, i + .25, "{:.4f} ms".format(v), color='black') +plt_file_name = 'plot_time_ingestion.png' +plt.savefig(plt_file_name) +plt.clf() +print("Save plot {}".format(plt_file_name)) + +# Size on disk +size_fields = es.indices.stats(INDEX_NAME_FIELDS)["indices"][INDEX_NAME_FIELDS]["total"]["store"]["size_in_bytes"] +size_flat = es.indices.stats(INDEX_NAME_FLAT)["indices"][INDEX_NAME_FLAT]["total"]["store"]["size_in_bytes"] +size_flatint = es.indices.stats(INDEX_NAME_FLATINT)["indices"][INDEX_NAME_FLATINT]["total"]["store"]["size_in_bytes"] +sizes = [size_fields, size_flat, size_flatint] +plt.xlabel("Size on disk (MB)") +plt.title("Size of ES index on disk") +values = [s/1024/1024 for s in sizes] +plt.barh(names, values, color=colors) +for i, v in enumerate(values): + plt.text(v - .01, i + .25, "{:.2f} MB".format(v), color='black') +plt_file_name = "plot_disk_usage.png" +plt.savefig(plt_file_name) +plt.clf() +print("Save plot {}".format(plt_file_name)) + +# Search Average Time +values_search = [total_time_search_fields, total_time_search_flat, total_time_search_flatint] +plt.xlabel("Search Time (ms)") +plt.title("Average Search Time ({} searches)".format(num_searches)) +values_search_mean = [v / float(len(all_files)) for v in values_search] +plt.barh(names, values_search_mean, color=colors) +for i, v in 
enumerate(values_search_mean): + plt.text(v - .01, i + .25, "{:.4f} ms".format(v), color='black') +plt_file_name = "plot_search_time.png" +plt.savefig(plt_file_name) +plt.clf() +print("Save plot {}".format(plt_file_name)) + +# Quantitative results + + +def draw_qualitative_plot(suffix): + """ + Draw a plot in a file from the stats_msm. + :param suffix: "flat" or "flatint" + :return: None + """ + names_msm = list(map(str, list(range_msm))) + same_first = [stats_msm[str(i_msm)]["same_first_" + suffix] for i_msm in list(range_msm)] + both_not_found = [stats_msm[str(i_msm)]["not_same_first_" + suffix][0] for i_msm in list(range_msm)] + found_in_fields = [stats_msm[str(i_msm)]["not_same_first_" + suffix][1] for i_msm in list(range_msm)] + found_in_flat = [stats_msm[str(i_msm)]["not_same_first_" + suffix][2] for i_msm in list(range_msm)] + fig, ax1 = plt.subplots() + + ax2 = ax1.twinx() + bar1 = ax2.bar(names_msm, both_not_found) + bar2 = ax2.bar(names_msm, found_in_fields, bottom=both_not_found) + bar3 = ax2.bar(names_msm, found_in_flat, bottom=found_in_fields) + maxval = max([sum([stats_msm[r]["not_same_first_" + suffix][idx_res] for r in names_msm]) + for idx_res in range(0, 3)]) + ax2.set_ylim(0, int(maxval + maxval*.75)) # Expand y limit of max histogram to have some space + + plot1 = ax1.plot(names_msm, same_first, "r-o") + minval = min([stats_msm[s]["same_first_" + suffix] for s in names_msm]) + ax1.set_ylim(int(500 - ((500 - minval) * 2.75)), 500) + + plt.xlabel("Minimum Should Match") + plt.ylabel("Hits") + plt.legend((plot1[0], bar1[0], bar2[0], bar3[0]), ('Same first result', 'Not found in both', + 'Found only in fields', 'Found only in ' + suffix)) + plt_file_name = 'plot_qualitative_{}.png'.format(suffix) + plt.savefig(plt_file_name) + plt.clf() + print("Save plot {}".format(plt_file_name)) + + +draw_qualitative_plot("flat") +draw_qualitative_plot("flatint") diff --git a/tests/test_elasticsearchflat_driver.py b/tests/test_elasticsearchflat_driver.py new file 
mode 100644 index 0000000..fae59de --- /dev/null +++ b/tests/test_elasticsearchflat_driver.py @@ -0,0 +1,15 @@ +import unittest +from image_match.elasticsearchflat_driver import SignatureES + +from tests.elasticsearch_helper import BaseTestsParent, DOC_TYPE, INDEX_NAME + + +class ElasticSearchFlatTestSuite(BaseTestsParent.BaseTests): + @property + def ses(self): + """ + Override the ses property to use the flat driver. + :return: SignatureES from image_match.elasticsearchflat_driver + """ + es = self.es + return SignatureES(es=es, index=INDEX_NAME, doc_type=DOC_TYPE)