Merge pull request #21 from Kensuke-Mitsuzawa/devel
Devel
Kensuke-Mitsuzawa authored Nov 29, 2016
2 parents cbd2e17 + ed0d5d2 commit 26b40c0
Showing 13 changed files with 211 additions and 67 deletions.
2 changes: 1 addition & 1 deletion DocumentFeatureSelection/__init__.py
@@ -7,7 +7,7 @@
import sys
python_version = sys.version_info

from DocumentFeatureSelection.common.data_converter import DataConverter, DataCsrMatrix
#from DocumentFeatureSelection.common.data_converter import DataConverter, DataCsrMatrix
from DocumentFeatureSelection.pmi.PMI import PMI
from DocumentFeatureSelection.tf_idf.tf_idf import TFIDF
from DocumentFeatureSelection.soa.soa import SOA
55 changes: 37 additions & 18 deletions DocumentFeatureSelection/common/data_converter.py
@@ -57,8 +57,6 @@ def count_term_frequency_distribution(self, labeled_documents:AvailableInputTypes
}

# make list of distribution
#term_frequency_distribution_list = [0] * len(labeled_documents.keys())
# TODO
term_frequency_distribution_list = [0] * len(labeled_documents)

for label_string, n_doc in term_frequency_distribution.items():
@@ -82,8 +80,6 @@ def count_document_distribution(self, labeled_documents:AvailableInputTypes, lab
}

# make list of distribution
# TODO
#n_doc_distribution_list = [0] * len(labeled_documents.keys())
n_doc_distribution_list = [0] * len(labeled_documents)

for label_string, n_doc in n_doc_distribution.items():
@@ -93,10 +89,22 @@

return numpy.array(n_doc_distribution_list, dtype='i8')

def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, joblib_backend='auto'):
# type: (AvailableInputTypes, int, int, str) -> DataCsrMatrix
"""This function makes TERM-frequency matrix for TF-IDF calculation.
TERM-frequency matrix is scipy.csr_matrix.
def labeledMultiDocs2TermFreqMatrix(self,
labeled_documents:AvailableInputTypes,
is_use_cache:bool=False,
is_use_memmap:bool=False,
path_working_dir:str=None,
joblib_backend:str='auto',
ngram:int=1,
n_jobs:int=1):
"""* What you can do
- This function makes TERM-frequency matrix for TF-IDF calculation.
- TERM-frequency matrix is scipy.csr_matrix.
* Params
- labeled_documents: Dict object which has category-name as key, and list of features as value
- is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge.
- path_working_dir: path to directory for saving cache files
"""
self.__check_data_structure(labeled_documents)

@@ -123,13 +131,21 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1,
)

return DataCsrMatrix(
set_document_information.matrix_object,
set_document_information.label2id,
set_document_information.feature2id,
n_docs_distribution, term_frequency_distribution)
csr_matrix_=set_document_information.matrix_object,
label2id_dict=set_document_information.label2id,
vocabulary=set_document_information.feature2id,
n_docs_distribution=n_docs_distribution,
n_term_freq_distribution=term_frequency_distribution,
is_use_cache=is_use_cache,
is_use_memmap=is_use_memmap,
path_working_dir=path_working_dir
)

def labeledMultiDocs2DocFreqMatrix(self,
labeled_documents:AvailableInputTypes,
is_use_cache:bool=False,
is_use_memmap:bool=False,
path_working_dir:str=None,
ngram:int=1,
n_jobs:int=1,
joblib_backend:str='auto')->DataCsrMatrix:
@@ -169,12 +185,15 @@ def labeledMultiDocs2DocFreqMatrix(self,
label2id=set_document_information.label2id
)
return DataCsrMatrix(
set_document_information.matrix_object,
set_document_information.label2id,
set_document_information.feature2id,
n_docs_distribution, term_frequency_distribution)


csr_matrix_=set_document_information.matrix_object,
label2id_dict=set_document_information.label2id,
vocabulary=set_document_information.feature2id,
n_docs_distribution=n_docs_distribution,
n_term_freq_distribution=term_frequency_distribution,
is_use_cache=is_use_cache,
is_use_memmap=is_use_memmap,
path_working_dir=path_working_dir
)

# -------------------------------------------------------------------------------------------------------------------
# function for output
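The converter's new keyword arguments can be exercised end-to-end. Below is a minimal sketch, assuming these methods live on the package's `DataConverter` class (imported in `__init__.py` above) and that the input follows the layout described in the docstring; the sample documents are illustrative:

```python
# A sketch under assumptions: DataConverter is the class these methods belong to,
# and the input maps a category name to a list of tokenized documents.
from DocumentFeatureSelection.common.data_converter import DataConverter

labeled_documents = {
    'positive': [['good', 'movie'], ['nice', 'story']],
    'negative': [['bad', 'acting'], ['boring', 'story']],
}

converter = DataConverter()
matrix_object = converter.labeledMultiDocs2TermFreqMatrix(
    labeled_documents=labeled_documents,
    is_use_cache=True,       # keep vocabulary/label2id dicts on disk
    is_use_memmap=True,      # keep the matrix as a numpy memmap on disk
    path_working_dir=None,   # None -> a temporary directory is created
    ngram=1,
    n_jobs=1)
print(matrix_object)         # DataCsrMatrix.__str__ reports type, shape, working dir
```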
44 changes: 36 additions & 8 deletions DocumentFeatureSelection/interface.py
@@ -23,11 +23,26 @@ def decide_joblib_strategy(feature2id_dict:Dict[str,int])->str:

def run_feature_selection(input_dict:AvailableInputTypes,
method:str,
ngram:int=1,
n_jobs:int=1,
joblib_backend='auto',
use_cython:bool=False,
is_use_cache:bool=False,
is_use_memmap:bool=False,
path_working_dir:str=None,
matrix_form=None,
use_cython:bool=False)->ScoredResultObject:
joblib_backend='auto',
n_jobs:int=1,
ngram:int=1)->ScoredResultObject:
"""A interface function of DocumentFeatureSelection package.
* Parameters
- input_dict: Dict-object which has category-name as key and list of features as value.
You can put dict or sqlitedict.SqliteDict, or DocumentFeatureSelection.models.PersistentDict
- method: A method name of feature selection metric
- use_cython: boolean flag to use cython code for computation. It's much faster to use cython than native-python code
- is_use_cache: boolean flag to use disk-drive for keeping objects which tends to be huge.
- is_use_memmap: boolean flag to use memmap for keeping matrix object.
- path_working_dir: str object.
The file path to directory where you save cache file or memmap matrix object. If you leave it None, it finds some directory and save files in it.
"""
if not method in METHOD_NAMES:
raise Exception('method name must be either of {}. Yours: {}'.format(METHOD_NAMES, method))

@@ -39,7 +54,10 @@ def run_feature_selection(input_dict:AvailableInputTypes,
labeled_documents=input_dict,
ngram=ngram,
n_jobs=n_jobs,
joblib_backend=joblib_backend
joblib_backend=joblib_backend,
is_use_cache=is_use_cache,
is_use_memmap=is_use_memmap,
path_working_dir=path_working_dir
)
assert isinstance(matrix_data_object, DataCsrMatrix)

@@ -53,7 +71,10 @@ def run_feature_selection(input_dict:AvailableInputTypes,
labeled_documents=input_dict,
ngram=ngram,
n_jobs=n_jobs,
joblib_backend=joblib_backend
joblib_backend=joblib_backend,
is_use_cache=is_use_cache,
is_use_memmap=is_use_memmap,
path_working_dir=path_working_dir
)
assert isinstance(matrix_data_object, DataCsrMatrix)
if method == 'pmi':
@@ -83,7 +104,10 @@ def run_feature_selection(input_dict:AvailableInputTypes,
labeled_documents=input_dict,
ngram=ngram,
n_jobs=n_jobs,
joblib_backend=joblib_backend
joblib_backend=joblib_backend,
is_use_cache=is_use_cache,
is_use_memmap=is_use_memmap,
path_working_dir=path_working_dir
)
assert isinstance(matrix_data_object, DataCsrMatrix)

@@ -109,7 +133,11 @@ def run_feature_selection(input_dict:AvailableInputTypes,
labeled_documents=input_dict,
ngram=ngram,
n_jobs=n_jobs,
joblib_backend=joblib_backend)
joblib_backend=joblib_backend,
is_use_cache=is_use_cache,
is_use_memmap=is_use_memmap,
path_working_dir=path_working_dir
)
assert isinstance(matrix_data_object, DataCsrMatrix)

true_class_index = matrix_data_object.label2id_dict['positive']
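Putting the new flags together, a minimal usage sketch of the extended interface (the input dict is illustrative; `'pmi'` is one of the method names dispatched above):

```python
# A sketch of run_feature_selection with the new cache/memmap flags.
from DocumentFeatureSelection import interface

input_dict = {
    'label_a': [['hello', 'world'], ['hello', 'python']],
    'label_b': [['goodbye', 'world']],
}

scored_result = interface.run_feature_selection(
    input_dict=input_dict,
    method='pmi',
    use_cython=False,
    is_use_cache=True,       # spill huge dict objects to a disk cache
    is_use_memmap=True,      # keep the matrix as a memmap on disk
    path_working_dir=None)   # None -> the package picks a directory itself
# scored_result is a ScoredResultObject, per the return annotation above
```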
51 changes: 46 additions & 5 deletions DocumentFeatureSelection/models.py
@@ -2,7 +2,9 @@
from scipy.sparse.csr import csr_matrix
from DocumentFeatureSelection.common import utils
from numpy.core.multiarray import array, ndarray
from numpy import memmap
from sqlitedict import SqliteDict
from tempfile import mkdtemp
import pickle, json, csv, os, shutil

# this class is from https://code.activestate.com/recipes/576642/
@@ -101,19 +103,58 @@ class DataCsrMatrix(object):
csr_matrix is a sparse matrix from scipy.sparse
"""

__slots__ = ['csr_matrix_', 'label2id_dict', 'vocabulary', 'n_docs_distribution', 'n_term_freq_distribution']
__slots__ = ['csr_matrix_', 'label2id_dict', 'vocabulary', 'n_docs_distribution', 'n_term_freq_distribution', 'path_working_dir']

def __init__(self, csr_matrix_:csr_matrix,
label2id_dict:Dict[str,int],
vocabulary:Dict[str,int],
n_docs_distribution:ndarray,
n_term_freq_distribution:ndarray):
self.csr_matrix_ = csr_matrix_
self.label2id_dict = label2id_dict
self.vocabulary = vocabulary
n_term_freq_distribution:ndarray,
is_use_cache:bool=False,
is_use_memmap:bool=False,
path_working_dir:str=None):

self.n_docs_distribution = n_docs_distribution
self.n_term_freq_distribution = n_term_freq_distribution
if path_working_dir is None: self.path_working_dir = mkdtemp()
else: self.path_working_dir = path_working_dir

if is_use_cache:
"""You use disk-drive for keeping object.
"""
path_vocabulary_cache_obj = os.path.join(self.path_working_dir, 'vocabulary.cache')
path_label_2_dict_cache_obj = os.path.join(self.path_working_dir, 'label_2_dict.cache')
# populate the persistent dicts; re-binding the names to the plain in-memory dicts would defeat the cache
self.vocabulary = self.initialize_cache_dict_object(path_vocabulary_cache_obj)
self.vocabulary.update(vocabulary)

self.label2id_dict = self.initialize_cache_dict_object(path_label_2_dict_cache_obj)
self.label2id_dict.update(label2id_dict)
else:
"""Keep everything on memory
"""
self.label2id_dict = label2id_dict
self.vocabulary = vocabulary

if is_use_memmap:
"""You use disk-drive for keeping object
"""
path_memmap_obj = os.path.join(self.path_working_dir, 'matrix.memmap')
self.csr_matrix_ = self.initialize_memmap_object(csr_matrix_, path_memmap_object=path_memmap_obj)
else:
self.csr_matrix_ = csr_matrix_

def initialize_cache_dict_object(self, path_cache_file):
return PersistentDict(path_cache_file, flag='c', format='json')

def initialize_memmap_object(self, matrix_object:csr_matrix, path_memmap_object:str)->memmap:
fp = memmap(path_memmap_object, dtype='float64', mode='w+', shape=matrix_object.shape)
fp[:] = matrix_object.todense()[:]
return fp

def __str__(self):
return """matrix-type={}, matrix-size={}, path_working_dir={}""".format(type(self.csr_matrix_),
self.csr_matrix_.shape,
self.path_working_dir)

class ScoredResultObject(object):
def __init__(self,
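To make the memmap path concrete, here is a standalone sketch of the round-trip that `initialize_memmap_object` performs, using only numpy and scipy. The trade-off is visible in the code: a dense float64 copy of the matrix lands on disk, and a reader must supply the shape again because a raw memmap file does not store it:

```python
# Standalone sketch of the csr_matrix -> numpy.memmap conversion shown above.
import os
from tempfile import mkdtemp

import numpy
from scipy.sparse import csr_matrix

matrix = csr_matrix(numpy.array([[0., 2., 0.], [1., 0., 3.]]))
path_memmap_obj = os.path.join(mkdtemp(), 'matrix.memmap')

fp = numpy.memmap(path_memmap_obj, dtype='float64', mode='w+', shape=matrix.shape)
fp[:] = matrix.todense()[:]  # write the dense contents through to the file
fp.flush()

# re-open later in read-only mode; the shape must be passed in again
fp_again = numpy.memmap(path_memmap_obj, dtype='float64', mode='r', shape=matrix.shape)
assert (fp_again == matrix.todense()).all()
```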
13 changes: 8 additions & 5 deletions DocumentFeatureSelection/pmi/PMI_python3.py
@@ -4,6 +4,8 @@
from __future__ import unicode_literals
from __future__ import division
from scipy.sparse import csr_matrix
from numpy import memmap
from typing import Union
from logging import getLogger, StreamHandler

import logging
@@ -25,7 +27,7 @@
# http://sucrose.hatenablog.com/entry/2014/12/02/235959


def pmi(X:csr_matrix,
def pmi(X:Union[csr_matrix, memmap],
n_docs_distribution:numpy.ndarray,
n_total_doc:int,
feature_index:int,
@@ -37,7 +39,7 @@ def pmi(X:csr_matrix,
:param sample_index:
:return:
"""
assert isinstance(X, csr_matrix)
assert isinstance(X, (memmap, csr_matrix))
assert isinstance(n_docs_distribution, numpy.ndarray)
assert isinstance(feature_index, int)
assert isinstance(sample_index, int)
@@ -79,15 +81,15 @@ class PMI(object):
def __init__(self):
pass

def fit_transform(self, X,
def fit_transform(self, X:Union[csr_matrix, memmap],
n_docs_distribution,
n_jobs=1,
verbose=False,
joblib_backend='multiprocessing',
use_cython:bool=False):
"""Main method of PMI class.
"""
assert isinstance(X, csr_matrix)
assert isinstance(X, (memmap, csr_matrix))
assert isinstance(n_docs_distribution, numpy.ndarray)

matrix_size = X.shape
@@ -136,7 +138,8 @@ def fit_transform(self, X,

return pmi_featured_csr_matrix

def docId_word_PMI(self, X:csr_matrix,
def docId_word_PMI(self,
X:Union[csr_matrix, memmap],
n_docs_distribution:numpy.ndarray,
n_total_doc:int,
feature_index:int,
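For reference, a hedged sketch of the score this module parallelizes, assuming the standard definition PMI(c, w) = log(P(c, w) / (P(c) P(w))) over a label-by-feature count matrix; the module's exact smoothing and normalization may differ:

```python
# Hedged sketch: PMI over a small label-by-feature count matrix. Zero counts
# produce -inf under the plain definition, so they are mapped to 0.0 here.
import numpy

counts = numpy.array([[4., 0., 1.],   # rows: labels, columns: features
                      [1., 3., 2.]])
n_total = counts.sum()

p_cw = counts / n_total                            # joint P(label, feature)
p_c = counts.sum(axis=1, keepdims=True) / n_total  # marginal P(label)
p_w = counts.sum(axis=0, keepdims=True) / n_total  # marginal P(feature)

with numpy.errstate(divide='ignore'):
    pmi_matrix = numpy.log(p_cw / (p_c * p_w))
pmi_matrix[numpy.isinf(pmi_matrix)] = 0.0
print(pmi_matrix)
```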
23 changes: 16 additions & 7 deletions DocumentFeatureSelection/soa/soa_python3.py
@@ -1,4 +1,6 @@
from scipy.sparse import csr_matrix
from numpy import memmap
from typing import Union
from logging import getLogger, StreamHandler
import logging
import joblib
@@ -15,12 +17,13 @@
__author__ = 'kensuke-mi'


def soa(X:csr_matrix, unit_distribution:numpy.ndarray,
def soa(X:Union[memmap, csr_matrix],
unit_distribution:numpy.ndarray,
n_total_docs:int,
feature_index:int,
sample_index:int, verbose=False):
# X is either of term-frequency matrix per label or document-frequency per label
assert isinstance(X, csr_matrix)
assert isinstance(X, (memmap, csr_matrix))
assert isinstance(unit_distribution, numpy.ndarray)
assert isinstance(feature_index, int)
assert isinstance(sample_index, int)
@@ -61,9 +64,14 @@ class SOA(object):
def __init__(self):
pass

def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False,
joblib_backend='multiprocessing', use_cython:bool=False):
assert isinstance(X, csr_matrix)
def fit_transform(self,
X:Union[memmap, csr_matrix],
unit_distribution:numpy.ndarray,
n_jobs=1,
verbose=False,
joblib_backend='multiprocessing',
use_cython:bool=False):
assert isinstance(X, (memmap, csr_matrix))
assert isinstance(unit_distribution, numpy.ndarray)

matrix_size = X.shape
@@ -112,13 +120,14 @@ def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False,
return soa_featured_csr_matrix


def docId_word_soa(self, X:csr_matrix, unit_distribution:numpy.ndarray,
def docId_word_soa(self, X:Union[memmap, csr_matrix],
unit_distribution:numpy.ndarray,
n_total_doc:int,
feature_index:int,
sample_index:int, verbose=False):
"""
"""
assert isinstance(X, csr_matrix)
assert isinstance(X, (memmap, csr_matrix))
assert isinstance(unit_distribution, numpy.ndarray)
assert isinstance(feature_index, int)
assert isinstance(sample_index, int)
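And likewise for SOA, a hedged sketch assuming the common strength-of-association definition SOA(w, c) = PMI(w, c) - PMI(w, not-c), which reduces to a log ratio of in-class and out-of-class frequencies; the module's exact handling of zero counts may differ:

```python
# Hedged sketch: strength-of-association over a label-by-feature count matrix.
import numpy

counts = numpy.array([[4., 0., 1.],   # rows: labels (c), columns: features (w)
                      [1., 3., 2.]])
n_total = counts.sum()

f_wc = counts                             # frequency of w inside label c
f_c = counts.sum(axis=1, keepdims=True)   # total frequency of label c
f_w = counts.sum(axis=0, keepdims=True)   # total frequency of feature w
f_w_not_c = f_w - f_wc                    # frequency of w outside label c
f_not_c = n_total - f_c                   # total frequency outside label c

with numpy.errstate(divide='ignore', invalid='ignore'):
    soa_matrix = numpy.log2((f_wc * f_not_c) / (f_c * f_w_not_c))
soa_matrix[~numpy.isfinite(soa_matrix)] = 0.0  # zero counts give inf/nan
print(soa_matrix)
```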
8 changes: 7 additions & 1 deletion README.md
@@ -177,4 +177,10 @@ Removed a bug when calling n_gram method of DataConverter

* You can put persisted-dict-object on disk-drive instead of dict-object on memory.
* You can put huge dict object as data-source of `interface.run_feature_selection()`
* See example `examples/huge_data_example.py`
* See example `examples/huge_data_example.py`


## 1.3.3 2016/11/30

* Introduced a file-cache and an optional memmap for keeping huge objects during computation.

6 changes: 5 additions & 1 deletion examples/huge_data_example.py
@@ -31,10 +31,14 @@
persistent_dict_obj['gutenberg'] = list(gutenberg_corpus)

start = time.time()
# If you pass is_use_cache=True, a disk cache keeps huge objects during computation
# If you pass is_use_memmap=True, a memmap keeps the matrix during computation
scored_matrix_obj = interface.run_feature_selection(
input_dict=persistent_dict_obj,
method='pmi',
use_cython=True
use_cython=True,
is_use_cache=True,
is_use_memmap=True
)
elapsed_time = time.time() - start
print ("elapsed_time with cython:{} [sec]".format(elapsed_time))
Expand Down