diff --git a/DocumentFeatureSelection/__init__.py b/DocumentFeatureSelection/__init__.py index 63f9981..36ffe65 100644 --- a/DocumentFeatureSelection/__init__.py +++ b/DocumentFeatureSelection/__init__.py @@ -7,7 +7,7 @@ import sys python_version = sys.version_info -from DocumentFeatureSelection.common.data_converter import DataConverter, DataCsrMatrix +#from DocumentFeatureSelection.common.data_converter import DataConverter, DataCsrMatrix from DocumentFeatureSelection.pmi.PMI import PMI from DocumentFeatureSelection.tf_idf.tf_idf import TFIDF from DocumentFeatureSelection.soa.soa import SOA diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index a04ebef..d3b0f11 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -57,8 +57,6 @@ def count_term_frequency_distribution(self, labeled_documents:AvailableInputType } # make list of distribution - #term_frequency_distribution_list = [0] * len(labeled_documents.keys()) - # TODO term_frequency_distribution_list = [0] * len(labeled_documents) for label_string, n_doc in term_frequency_distribution.items(): @@ -82,8 +80,6 @@ def count_document_distribution(self, labeled_documents:AvailableInputTypes, lab } # make list of distribution - # TODO - #n_doc_distribution_list = [0] * len(labeled_documents.keys()) n_doc_distribution_list = [0] * len(labeled_documents) for label_string, n_doc in n_doc_distribution.items(): @@ -93,10 +89,22 @@ def count_document_distribution(self, labeled_documents:AvailableInputTypes, lab return numpy.array(n_doc_distribution_list, dtype='i8') - def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, joblib_backend='auto'): - # type: (AvailableInputTypes, int, int, str) -> DataCsrMatrix - """This function makes TERM-frequency matrix for TF-IDF calculation. - TERM-frequency matrix is scipy.csr_matrix. + def labeledMultiDocs2TermFreqMatrix(self, + labeled_documents:AvailableInputTypes, + is_use_cache:bool=False, + is_use_memmap:bool=False, + path_working_dir:str=None, + joblib_backend:str='auto', + ngram:int=1, + n_jobs:int=1): + """* What you can do + - This function makes TERM-frequency matrix for TF-IDF calculation. + - TERM-frequency matrix is scipy.csr_matrix. + + * Params + - labeled_documents: Dict object which has category-name as key, and list of features as value + - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge. + - path_working_dir: path to directory for saving cache files """ self.__check_data_structure(labeled_documents) @@ -123,13 +131,21 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, ) return DataCsrMatrix( - set_document_information.matrix_object, - set_document_information.label2id, - set_document_information.feature2id, - n_docs_distribution, term_frequency_distribution) + csr_matrix_=set_document_information.matrix_object, + label2id_dict=set_document_information.label2id, + vocabulary=set_document_information.feature2id, + n_docs_distribution=n_docs_distribution, + n_term_freq_distribution=term_frequency_distribution, + is_use_cache=is_use_cache, + is_use_memmap=is_use_memmap, + path_working_dir=path_working_dir + ) def labeledMultiDocs2DocFreqMatrix(self, labeled_documents:AvailableInputTypes, + is_use_cache:bool=False, + is_use_memmap:bool=False, + path_working_dir:str=None, ngram:int=1, n_jobs:int=1, joblib_backend:str='auto')->DataCsrMatrix: @@ -169,12 +185,15 @@ def labeledMultiDocs2DocFreqMatrix(self, label2id=set_document_information.label2id ) return DataCsrMatrix( - set_document_information.matrix_object, - set_document_information.label2id, - set_document_information.feature2id, - n_docs_distribution, term_frequency_distribution) - - + csr_matrix_=set_document_information.matrix_object, + label2id_dict=set_document_information.label2id, + vocabulary=set_document_information.feature2id, + n_docs_distribution=n_docs_distribution, + n_term_freq_distribution=term_frequency_distribution, + is_use_cache=is_use_cache, + is_use_memmap=is_use_memmap, + path_working_dir=path_working_dir + ) # ------------------------------------------------------------------------------------------------------------------- # function for output diff --git a/DocumentFeatureSelection/interface.py b/DocumentFeatureSelection/interface.py index e10cb81..eff655e 100644 --- a/DocumentFeatureSelection/interface.py +++ b/DocumentFeatureSelection/interface.py @@ -23,11 +23,26 @@ def decide_joblib_strategy(feature2id_dict:Dict[str,int])->str: def run_feature_selection(input_dict:AvailableInputTypes, method:str, - ngram:int=1, - n_jobs:int=1, - joblib_backend='auto', + use_cython:bool=False, + is_use_cache:bool=False, + is_use_memmap:bool=False, + path_working_dir:str=None, matrix_form=None, - use_cython:bool=False)->ScoredResultObject: + joblib_backend='auto', + n_jobs:int=1, + ngram:int=1)->ScoredResultObject: + """A interface function of DocumentFeatureSelection package. + + * Parameters + - input_dict: Dict-object which has category-name as key and list of features as value. + You can put dict or sqlitedict.SqliteDict, or DocumentFeatureSelection.models.PersistentDict + - method: A method name of feature selection metric + - use_cython: boolean flag to use cython code for computation. It's much faster to use cython than native-python code + - is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge. + - is_use_memmap: boolean flag to use memmap for keeping matrix object. + - path_working_dir: str object. + The file path to directory where you save cache file or memmap matrix object. If you leave it None, it finds some directory and save files in it. + """ if not method in METHOD_NAMES: raise Exception('method name must be either of {}. Yours: {}'.format(METHOD_NAMES, method)) @@ -39,7 +54,10 @@ def run_feature_selection(input_dict:AvailableInputTypes, labeled_documents=input_dict, ngram=ngram, n_jobs=n_jobs, - joblib_backend=joblib_backend + joblib_backend=joblib_backend, + is_use_cache=is_use_cache, + is_use_memmap=is_use_memmap, + path_working_dir=path_working_dir ) assert isinstance(matrix_data_object, DataCsrMatrix) @@ -53,7 +71,10 @@ def run_feature_selection(input_dict:AvailableInputTypes, labeled_documents=input_dict, ngram=ngram, n_jobs=n_jobs, - joblib_backend=joblib_backend + joblib_backend=joblib_backend, + is_use_cache=is_use_cache, + is_use_memmap=is_use_memmap, + path_working_dir=path_working_dir ) assert isinstance(matrix_data_object, DataCsrMatrix) if method == 'pmi': @@ -83,7 +104,10 @@ def run_feature_selection(input_dict:AvailableInputTypes, labeled_documents=input_dict, ngram=ngram, n_jobs=n_jobs, - joblib_backend=joblib_backend + joblib_backend=joblib_backend, + is_use_cache=is_use_cache, + is_use_memmap=is_use_memmap, + path_working_dir=path_working_dir ) assert isinstance(matrix_data_object, DataCsrMatrix) @@ -109,7 +133,11 @@ def run_feature_selection(input_dict:AvailableInputTypes, labeled_documents=input_dict, ngram=ngram, n_jobs=n_jobs, - joblib_backend=joblib_backend) + joblib_backend=joblib_backend, + is_use_cache=is_use_cache, + is_use_memmap=is_use_memmap, + path_working_dir=path_working_dir + ) assert isinstance(matrix_data_object, DataCsrMatrix) true_class_index = matrix_data_object.label2id_dict['positive'] diff --git a/DocumentFeatureSelection/models.py b/DocumentFeatureSelection/models.py index ca213b8..d9f98ba 100644 --- a/DocumentFeatureSelection/models.py +++ b/DocumentFeatureSelection/models.py @@ -2,7 +2,9 @@ from scipy.sparse.csr import csr_matrix from DocumentFeatureSelection.common import utils from numpy.core.multiarray import array, ndarray +from numpy import memmap from sqlitedict import SqliteDict +from tempfile import mkdtemp import pickle, json, csv, os, shutil # this class is from https://code.activestate.com/recipes/576642/ @@ -101,19 +103,58 @@ class DataCsrMatrix(object): csr_matrix is, sparse matrix from scipy.sparse """ - __slots__ = ['csr_matrix_', 'label2id_dict', 'vocabulary', 'n_docs_distribution', 'n_term_freq_distribution'] + __slots__ = ['csr_matrix_', 'label2id_dict', 'vocabulary', 'n_docs_distribution', 'n_term_freq_distribution', 'path_working_dir'] def __init__(self, csr_matrix_:csr_matrix, label2id_dict:Dict[str,int], vocabulary:Dict[str,int], n_docs_distribution:ndarray, - n_term_freq_distribution:ndarray): - self.csr_matrix_ = csr_matrix_ - self.label2id_dict = label2id_dict - self.vocabulary = vocabulary + n_term_freq_distribution:ndarray, + is_use_cache:bool=False, + is_use_memmap:bool=False, + path_working_dir:str=None): + self.n_docs_distribution = n_docs_distribution self.n_term_freq_distribution = n_term_freq_distribution + if path_working_dir is None: self.path_working_dir = mkdtemp() + else: self.path_working_dir = path_working_dir + + if is_use_cache: + """You use disk-drive for keeping object. + """ + path_vocabulary_cache_obj = os.path.join(self.path_working_dir, 'vocabulary.cache') + path_label_2_dict_cache_obj = os.path.join(self.path_working_dir, 'label_2_dict.cache') + self.vocabulary = self.initialize_cache_dict_object(path_vocabulary_cache_obj) + self.vocabulary = vocabulary + + self.label2id_dict = self.initialize_cache_dict_object(path_label_2_dict_cache_obj) + self.label2id_dict = label2id_dict + else: + """Keep everything on memory + """ + self.label2id_dict = label2id_dict + self.vocabulary = vocabulary + + if is_use_memmap: + """You use disk-drive for keeping object + """ + path_memmap_obj = os.path.join(self.path_working_dir, 'matrix.memmap') + self.csr_matrix_ = self.initialize_memmap_object(csr_matrix_, path_memmap_object=path_memmap_obj) + else: + self.csr_matrix_ = csr_matrix_ + + def initialize_cache_dict_object(self, path_cache_file): + return PersistentDict(path_cache_file, flag='c', format='json') + + def initialize_memmap_object(self, matrix_object:csr_matrix, path_memmap_object:str)->memmap: + fp = memmap(path_memmap_object, dtype='float64', mode='w+', shape=matrix_object.shape) + fp[:] = matrix_object.todense()[:] + return fp + def __str__(self): + return """matrix-type={}, matrix-size={}, path_working_dir={}""".format(type(self.csr_matrix_), + self.csr_matrix_.shape, + self.path_working_dir) class ScoredResultObject(object): def __init__(self, diff --git a/DocumentFeatureSelection/pmi/PMI_python3.py b/DocumentFeatureSelection/pmi/PMI_python3.py index 92d0006..263b886 100644 --- a/DocumentFeatureSelection/pmi/PMI_python3.py +++ b/DocumentFeatureSelection/pmi/PMI_python3.py @@ -4,6 +4,8 @@ from __future__ import unicode_literals from __future__ import division from scipy.sparse import csr_matrix +from numpy import memmap +from typing import Union from logging import getLogger, StreamHandler import logging @@ -25,7 +27,7 @@ # http://sucrose.hatenablog.com/entry/2014/12/02/235959 -def pmi(X:csr_matrix, +def pmi(X:Union[csr_matrix, memmap], n_docs_distribution:numpy.ndarray, n_total_doc:int, feature_index:int, @@ -37,7 +39,7 @@ def pmi(X:csr_matrix, :param sample_index: :return: """ - assert isinstance(X, csr_matrix) + assert isinstance(X, (memmap, csr_matrix)) assert isinstance(n_docs_distribution, numpy.ndarray) assert isinstance(feature_index, int) assert isinstance(sample_index, int) @@ -79,7 +81,7 @@ class PMI(object): def __init__(self): pass - def fit_transform(self, X, + def fit_transform(self, X:Union[csr_matrix, memmap], n_docs_distribution, n_jobs=1, verbose=False, @@ -87,7 +89,7 @@ def fit_transform(self, X, use_cython:bool=False): """Main method of PMI class. """ - assert isinstance(X, csr_matrix) + assert isinstance(X, (memmap, csr_matrix)) assert isinstance(n_docs_distribution, numpy.ndarray) matrix_size = X.shape @@ -136,7 +138,8 @@ def fit_transform(self, X, return pmi_featured_csr_matrix - def docId_word_PMI(self, X:csr_matrix, + def docId_word_PMI(self, + X:Union[csr_matrix, memmap], n_docs_distribution:numpy.ndarray, n_total_doc:int, feature_index:int, diff --git a/DocumentFeatureSelection/soa/soa_python3.py b/DocumentFeatureSelection/soa/soa_python3.py index 1dddc7a..65c5899 100644 --- a/DocumentFeatureSelection/soa/soa_python3.py +++ b/DocumentFeatureSelection/soa/soa_python3.py @@ -1,4 +1,6 @@ from scipy.sparse import csr_matrix +from numpy import memmap +from typing import Union from logging import getLogger, StreamHandler import logging import joblib @@ -15,12 +17,13 @@ __author__ = 'kensuke-mi' -def soa(X:csr_matrix, unit_distribution:numpy.ndarray, +def soa(X:Union[memmap, csr_matrix], + unit_distribution:numpy.ndarray, n_total_docs:int, feature_index:int, sample_index:int, verbose=False): # X is either of term-frequency matrix per label or document-frequency per label - assert isinstance(X, csr_matrix) + assert isinstance(X, (memmap, csr_matrix)) assert isinstance(unit_distribution, numpy.ndarray) assert isinstance(feature_index, int) assert isinstance(sample_index, int) @@ -61,9 +64,14 @@ class SOA(object): def __init__(self): pass - def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False, - joblib_backend='multiprocessing', use_cython:bool=False): - assert isinstance(X, csr_matrix) + def fit_transform(self, + X:Union[memmap, csr_matrix], + unit_distribution:numpy.ndarray, + n_jobs=1, + verbose=False, + joblib_backend='multiprocessing', + use_cython:bool=False): + assert isinstance(X, (memmap, csr_matrix)) assert isinstance(unit_distribution, numpy.ndarray) matrix_size = X.shape @@ -112,13 +120,14 @@ def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=Fa return soa_featured_csr_matrix - def docId_word_soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, + def docId_word_soa(self, X:Union[memmap, csr_matrix], + unit_distribution:numpy.ndarray, n_total_doc:int, feature_index:int, sample_index:int, verbose=False): """ """ - assert isinstance(X, csr_matrix) + assert isinstance(X, (memmap, csr_matrix)) assert isinstance(unit_distribution, numpy.ndarray) assert isinstance(feature_index, int) assert isinstance(sample_index, int) diff --git a/README.md b/README.md index 5b43271..fdc6233 100644 --- a/README.md +++ b/README.md @@ -177,4 +177,10 @@ Removed a bug when calling n_gram method of DataConverter * You can put persisted-dict-object on disk-drive instead of dict-object on memory. * You can put huge dict object as data-source of `interface.run_feature_selection()` - * See example `examples/huge_data_example.py` \ No newline at end of file + * See example `examples/huge_data_example.py` + + +## 1.3.3 2016/11/30 + +* It introduced file-cache for keeping huge objects during computation. + \ No newline at end of file diff --git a/examples/huge_data_example.py b/examples/huge_data_example.py index 19c7925..524781b 100644 --- a/examples/huge_data_example.py +++ b/examples/huge_data_example.py @@ -31,10 +31,14 @@ persistent_dict_obj['gutenberg'] = list(gutenberg_corpus) start = time.time() +# If you put is_use_cache=True, it uses cache object for keeping huge objects during computation +# If you put is_use_memmap=True, it uses memmap for keeping matrix during computation scored_matrix_obj = interface.run_feature_selection( input_dict=persistent_dict_obj, method='pmi', - use_cython=True + use_cython=True, + is_use_cache=True, + is_use_memmap=True ) elapsed_time = time.time() - start print ("elapsed_time with cython:{} [sec]".format(elapsed_time)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..30c9363 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +nose +tox +joblib +typing \ No newline at end of file diff --git a/setup.py b/setup.py index 04ffe69..4ee37bb 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ """ __author__ = 'kensuke-mi' -__version__ = '1.3.2' +__version__ = '1.3.3' import sys import pip @@ -42,6 +42,7 @@ use_numpy_include_dirs = False try: pip.main(['install', 'numpy']) + import numpy except: raise Exception('We failed to install numpy automatically. Try installing numpy manually or Try anaconda distribution.') # -------------------------------------------------------------------------------------------------------- @@ -52,6 +53,7 @@ use_numpy_include_dirs = False try: pip.main(['install', 'scipy']) + import scipy except: raise Exception('We failed to install scipy automatically. Try installing scipy manually or Try anaconda distribution.') # -------------------------------------------------------------------------------------------------------- @@ -60,7 +62,8 @@ if python_version >= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', 'numpy', - 'scipy', 'nltk', 'scikit-learn', 'pypandoc', 'cython', 'sqlitedict'] + 'scipy', 'nltk', 'scikit-learn', 'pypandoc', 'cython', 'sqlitedict', 'nose', + 'typing'] else: raise Exception('This package does NOT support Python2.x') diff --git a/test-requirements.txt b/test-requirements.txt new file mode 100644 index 0000000..dceffa5 --- /dev/null +++ b/test-requirements.txt @@ -0,0 +1,2 @@ +joblib +typing \ No newline at end of file diff --git a/tests/test_interface.py b/tests/test_interface.py index bd95c7d..ce3e5fc 100644 --- a/tests/test_interface.py +++ b/tests/test_interface.py @@ -29,7 +29,9 @@ def setUpClass(cls): ] } cls.method = ['pmi', 'tf_idf', 'soa'] - cls.bool_cython = [False, True] + cls.bool_cython = [True, False] + cls.is_use_cache = [True, False] + cls.is_use_memmap = [True, False] cls.joblib_range = range(0, 2) cls.path_shelve_file = './shelve' cls.path_sqlite3_persistent = './temp_db.sqlite3' @@ -39,6 +41,11 @@ def tearDownClass(cls): os.remove(cls.path_sqlite3_persistent) def test_interface_shelve(self): + """パラメタ条件を組み合わせてテストを実行する  + - cythonモード使う or not + - cacheモード使う or not + - memmapモード使う or not + """ shelve_obj = PersistentDict(self.path_shelve_file, 'c', 'json') for key, value in self.input_dict.items(): shelve_obj[key] = value @@ -47,27 +54,35 @@ def test_interface_shelve(self): for method_name in self.method: for cython_flag in self.bool_cython: - scored_result_persisted = interface.run_feature_selection( - input_dict=shelve_obj, - method=method_name, use_cython=cython_flag) # type: ScoredResultObject - self.assertIsInstance(scored_result_persisted, ScoredResultObject) - self.assertIsInstance(scored_result_persisted.ScoreMatrix2ScoreDictionary(), list) + for cache_flag in self.is_use_cache: + for memmap_flag in self.is_use_memmap: + scored_result_persisted = interface.run_feature_selection( + input_dict=shelve_obj, + method=method_name, + use_cython=cython_flag, + is_use_cache=cache_flag, + is_use_memmap=memmap_flag + ) # type: ScoredResultObject + self.assertIsInstance(scored_result_persisted, ScoredResultObject) + self.assertIsInstance(scored_result_persisted.ScoreMatrix2ScoreDictionary(), list) - scored_result_sqlite3_persisted = interface.run_feature_selection( - input_dict=sqlite3_dict_obj, - method=method_name, use_cython=cython_flag) # type: ScoredResultObject - self.assertIsInstance(scored_result_sqlite3_persisted, ScoredResultObject) - self.assertIsInstance(scored_result_sqlite3_persisted.ScoreMatrix2ScoreDictionary(), list) + scored_result_sqlite3_persisted = interface.run_feature_selection( + input_dict=sqlite3_dict_obj, + method=method_name, use_cython=cython_flag, is_use_cache=cache_flag) # type: ScoredResultObject + self.assertIsInstance(scored_result_sqlite3_persisted, ScoredResultObject) + self.assertIsInstance(scored_result_sqlite3_persisted.ScoreMatrix2ScoreDictionary(), list) - # You check if result is same between data-source = shelve_obj and data-source = dict-object - scored_result_dict = interface.run_feature_selection( - input_dict=self.input_dict, - method=method_name, use_cython=cython_flag) # type: ScoredResultObject - self.assertIsInstance(scored_result_dict, ScoredResultObject) - self.assertIsInstance(scored_result_dict.ScoreMatrix2ScoreDictionary(), list) + # You check if result is same between data-source = shelve_obj and data-source = dict-object + scored_result_dict = interface.run_feature_selection( + input_dict=self.input_dict, + method=method_name, use_cython=cython_flag, is_use_cache=cache_flag) # type: ScoredResultObject + self.assertIsInstance(scored_result_dict, ScoredResultObject) + self.assertIsInstance(scored_result_dict.ScoreMatrix2ScoreDictionary(), list) - numpy.testing.assert_array_equal(scored_result_persisted.scored_matrix.toarray(), scored_result_dict.scored_matrix.toarray()) - numpy.testing.assert_array_equal(scored_result_sqlite3_persisted.scored_matrix.toarray(), scored_result_dict.scored_matrix.toarray()) + numpy.testing.assert_array_equal(scored_result_persisted.scored_matrix.toarray(), + scored_result_dict.scored_matrix.toarray()) + numpy.testing.assert_array_equal(scored_result_sqlite3_persisted.scored_matrix.toarray(), + scored_result_dict.scored_matrix.toarray()) if __name__ == '__main__': unittest.main() diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..e833409 --- /dev/null +++ b/tox.ini @@ -0,0 +1,10 @@ +[tox] +envlist = py34,py35,py351,py352 + +[testenv] +deps = + -U + -r{toxinidir}/requirements.txt + -r{toxinidir}/test-requirements.txt +commands = + nosetests -v \ No newline at end of file