From 114c644143a1066dcf48868bf01369b6b1056903 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 21 Sep 2016 23:13:46 +0900 Subject: [PATCH 01/12] refactored PMI code --- .../common/crs_matrix_constructor.py | 60 +++++++++++------- .../common/data_converter.py | 59 +++++++++-------- .../common/labeledMultiDocs2labeledDocsSet.py | 40 +++++++----- DocumentFeatureSelection/common/utils.py | 39 ++++++++---- DocumentFeatureSelection/models.py | 28 ++++---- DocumentFeatureSelection/pmi/PMI_python3.py | 20 ++++-- examples/check_performance.py.lprof | Bin 0 -> 265 bytes 7 files changed, 150 insertions(+), 96 deletions(-) create mode 100644 examples/check_performance.py.lprof diff --git a/DocumentFeatureSelection/common/crs_matrix_constructor.py b/DocumentFeatureSelection/common/crs_matrix_constructor.py index 7050b36..9d25eb0 100644 --- a/DocumentFeatureSelection/common/crs_matrix_constructor.py +++ b/DocumentFeatureSelection/common/crs_matrix_constructor.py @@ -3,6 +3,8 @@ import joblib import sys import logging +import numpy +from typing import List, Tuple from scipy.sparse import csr_matrix logging.basicConfig(format='%(asctime)s %(message)s', @@ -15,37 +17,48 @@ python_version = sys.version_info __author__ = 'kensuke-mi' -PosTuple = namedtuple('PosTuple', ('doc_id', 'word_id', 'document_frequency')) + +class PosTuple(object): + __slots__ = ['doc_id', 'word_id', 'document_frequency'] + def __init__(self, doc_id, word_id, document_frequency): + self.doc_id = doc_id + self.word_id = word_id + self.document_frequency = document_frequency + + PARAM_JOBLIB_BACKEND = ['multiprocessing', 'threading'] -def get_data_col_row_values(doc_id:int, word:int, doc_freq:int, vocaburary): - assert isinstance(vocaburary, dict) - try: - col_value = vocaburary[word] - except KeyError: - print() +def get_data_col_row_values(doc_id:int, word:int, doc_freq:int, vocaburary:numpy.ndarray)->numpy.array: + """* what you can do + - You get array of [document_id, feature_id, value(frequency)] + """ + assert isinstance(vocaburary, numpy.ndarray) + col_element = vocaburary[numpy.where(vocaburary['key']==word)] + assert len(col_element) == 1 + col_value = col_element[0]['value'] # df value is word frequency in documents df_value = doc_freq - return PosTuple(doc_id, col_value, df_value) + return numpy.array([doc_id, col_value, df_value]) + +def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary:numpy.ndarray)->numpy.ndarray: + + value_pairs = numpy.array([ + get_data_col_row_values(doc_id=doc_id, word=key_value_tuple[0], doc_freq=key_value_tuple[1], vocaburary=vocabulary) + for key_value_tuple + in doc_freq_obj]) -def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj, vocabulary): - value_pairs = [ - get_data_col_row_values(doc_id=doc_id, word=word, doc_freq=freq, vocaburary=vocabulary) - for word, freq - in doc_freq_obj.items() - ] - assert isinstance(value_pairs, list) return value_pairs -def make_csr_list(value_position_list): + +def make_csr_list(value_position_list:List[numpy.array])->Tuple[List[int], List[int], List[int]]: data = [] row = [] col = [] for position_tuple in value_position_list: - row.append(position_tuple.doc_id) - col.append(position_tuple.word_id) - data.append(position_tuple.document_frequency) + row.append(position_tuple[0]) + col.append(position_tuple[1]) + data.append(position_tuple[2]) return row, col, data @@ -74,7 +87,7 @@ def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_back assert Exception('joblib_backend parameter must be either of 
{}. However your input is {}.'.format(PARAM_JOBLIB_BACKEND, joblib_backend)) assert isinstance(feature_frequency, list) - assert isinstance(vocabulary, dict) + assert isinstance(vocabulary, (numpy.ndarray, numpy.array)) assert isinstance(n_jobs, int) logger.debug(msg='making tuple pairs for csr matrix with n(process)={}'.format(n_jobs)) @@ -86,11 +99,12 @@ def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_back vocabulary ) for doc_id, doc_freq_obj in enumerate(feature_frequency) - ) + ) # type: List[numpy.ndarray] + + # make 2-d list into 1-d list value_position_list = sorted( [l for set in set_value_position_list for l in set], - key=lambda pos_tuple: (pos_tuple[0], pos_tuple[1], pos_tuple[2]) - ) + key=lambda pos_tuple: (pos_tuple[0], pos_tuple[1], pos_tuple[2])) row, col, data = make_csr_list(value_position_list) diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index 38c4eef..6b05cdb 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -12,6 +12,8 @@ from DocumentFeatureSelection import init_logger import logging import sys +import numpy +import pickle from typing import Dict, List, Tuple, Union, Any python_version = sys.version_info logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) @@ -66,11 +68,11 @@ def __check_data_structure(self, labeled_documents): return True - def count_term_frequency_distribution(self, labeled_documents, label2id_dict): + def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:numpy.core.multiarray.array): """Count term-distribution per label. """ assert isinstance(labeled_documents, dict) - assert isinstance(label2id_dict, dict) + assert isinstance(label2id, numpy.ndarray) # count total term-frequency per label term_frequency_distribution = { @@ -83,11 +85,12 @@ def count_term_frequency_distribution(self, labeled_documents, label2id_dict): term_frequency_distribution_list = [0] * len(labeled_documents.keys()) for label_string, n_doc in term_frequency_distribution.items(): - term_frequency_distribution_list[label2id_dict[label_string]] = n_doc + term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + term_frequency_distribution_list[term_index] = n_doc - return term_frequency_distribution_list + return numpy.array(term_frequency_distribution_list, dtype='i8') - def count_document_distribution(self, labeled_documents, label2id_dict): + def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:numpy.core.multiarray.array)->numpy.ndarray: """This method count n(docs) per label. 
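The structured-array lookups introduced in this hunk replace the former plain label2id dicts; a minimal standalone sketch of that access pattern (array contents are illustrative, not taken from the library):

```python
import numpy

# hypothetical label2id table in the same layout the refactor uses
label2id = numpy.array([(b'negative', 0), (b'positive', 1)],
                       dtype=[('key', 'S16'), ('value', 'i8')])

label_string = 'positive'
# same where()-based lookup as count_term_frequency_distribution above
index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value']
assert index == 1
```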
:param labeled_documents: @@ -95,7 +98,7 @@ def count_document_distribution(self, labeled_documents, label2id_dict): :return: """ assert isinstance(labeled_documents, dict) - assert isinstance(label2id_dict, dict) + assert isinstance(label2id, numpy.ndarray) # count n(docs) per label n_doc_distribution = { @@ -108,9 +111,10 @@ def count_document_distribution(self, labeled_documents, label2id_dict): n_doc_distribution_list = [0] * len(labeled_documents.keys()) for label_string, n_doc in n_doc_distribution.items(): - n_doc_distribution_list[label2id_dict[label_string]] = n_doc + docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + n_doc_distribution_list[docs_index] = n_doc - return n_doc_distribution_list + return numpy.array(n_doc_distribution_list, dtype='i8') def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, joblib_backend='auto'): """This function makes TERM-frequency matrix for TF-IDF calculation. @@ -138,39 +142,40 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, # make set of tuples to construct csr_matrix row, col, data = crs_matrix_constructor.preprocess_csr_matrix( feature_frequency=set_document_information.feature_frequency, - vocabulary=set_document_information.feature2id_dict, + vocabulary=set_document_information.feature2id, n_jobs=n_jobs, joblib_backend=joblib_backend ) logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=max(set_document_information.feature2id_dict.values())+1, + n_feature=max(set_document_information.feature2id.values())+1, n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label n_docs_distribution = self.count_document_distribution( labeled_documents=labeled_documents, - label2id_dict=set_document_information.label2id_dict + label2id=set_document_information.label2id ) # count term-frequency per label term_frequency_distribution = self.count_term_frequency_distribution( labeled_documents=labeled_documents, - label2id_dict=set_document_information.label2id_dict + label2id=set_document_information.label2id ) assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id_dict, dict) - assert isinstance(set_document_information.feature2id_dict, dict) + assert isinstance(set_document_information.label2id, dict) + assert isinstance(set_document_information.label2id, dict) assert isinstance(n_docs_distribution, list) return DataCsrMatrix( csr_matrix_, - set_document_information.label2id_dict, - set_document_information.feature2id_dict, + set_document_information.label2id, + set_document_information.feature2id, n_docs_distribution, term_frequency_distribution) - def labeledMultiDocs2DocFreqMatrix(self, labeled_documents, + def labeledMultiDocs2DocFreqMatrix(self, + labeled_documents:Dict[str,List[Any]], ngram:int=1, n_jobs:int=1, joblib_backend:str='auto')->DataCsrMatrix: @@ -229,7 +234,7 @@ def labeledMultiDocs2DocFreqMatrix(self, labeled_documents, n_jobs=n_jobs, joblib_backend=joblib_backend) assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation) - logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature2id_dict))) + logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature2id))) if joblib_backend == 'auto' and 
len(set_document_information.feature_frequency) >= 100000: joblib_backend = 'threading' if joblib_backend == 'auto' and len(set_document_information.feature_frequency) < 100000: @@ -238,35 +243,35 @@ def labeledMultiDocs2DocFreqMatrix(self, labeled_documents, # make set of tuples to construct csr_matrix row, col, data = crs_matrix_constructor.preprocess_csr_matrix( feature_frequency=set_document_information.feature_frequency, - vocabulary=set_document_information.feature2id_dict, + vocabulary=set_document_information.feature2id, n_jobs=n_jobs, joblib_backend=joblib_backend ) logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=max(set_document_information.feature2id_dict.values())+1, + n_feature=len(set_document_information.feature2id)+1, n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label n_docs_distribution = self.count_document_distribution( labeled_documents=labeled_documents, - label2id_dict=set_document_information.label2id_dict + label2id=set_document_information.label2id ) # count term-frequency per label term_frequency_distribution = self.count_term_frequency_distribution( labeled_documents=labeled_documents, - label2id_dict=set_document_information.label2id_dict + label2id=set_document_information.label2id ) assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id_dict, dict) - assert isinstance(set_document_information.feature2id_dict, dict) - assert isinstance(n_docs_distribution, list) + assert isinstance(set_document_information.label2id, numpy.ndarray) + assert isinstance(set_document_information.feature2id, numpy.ndarray) + assert isinstance(n_docs_distribution, numpy.ndarray) return DataCsrMatrix( csr_matrix_, - set_document_information.label2id_dict, - set_document_information.feature2id_dict, + set_document_information.label2id, + set_document_information.feature2id, n_docs_distribution, term_frequency_distribution) diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py index d015991..c7c6b8c 100644 --- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py +++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py @@ -6,11 +6,13 @@ from typing import Dict, List, Tuple, Any, Union import logging import joblib +import numpy +import pickle logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) N_FEATURE_SWITCH_STRATEGY = 1000000 def generate_document_dict(document_key:str, - documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Dict[str, int]]: + documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Dict[Union[str,bytes], int]]: """This function gets Document-frequency count in given list of documents """ assert isinstance(documents, list) @@ -18,12 +20,6 @@ def generate_document_dict(document_key:str, document_frequencies = Counter() for word_frequency in word_frequencies: document_frequencies.update(word_frequency.keys()) document_frequency_dict = dict(document_frequencies) - ''' - V = set([t for d in documents for t in d]) - document_frequency_dict = {} - for v in V: - binary_count = [1 for d in documents if v in d] - document_frequency_dict[v] = sum(binary_count)''' assert isinstance(document_frequency_dict, dict) return (document_key, document_frequency_dict) @@ -86,22 +82,25 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, 
List[List[Union[str, Tuple if type_flag == set(['str']): feature_list = list(set(utils.flatten(labeled_documents.values()))) feature_list = sorted(feature_list) + max_lenght = max([len(s) for s in feature_list]) elif type_flag == set(['tuple']): + # make tuple into string feature_list = list(set(utils.flatten(labeled_documents.values()))) - feature_list = sorted(feature_list) + feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] + max_lenght = max([len(s) for s in feature_list]) + 10 else: raise Exception('Your input data has various type of data. Detected types: {}'.format(type_flag)) - feature2id_dict = {t: index for index, t in enumerate(feature_list)} + feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: array # make label: id dictionary structure label2id_dict = {} - # make list of document-frequency + # list of document-frequency array feature_frequency = [] - if joblib_backend == 'auto' and len(feature2id_dict) >= N_FEATURE_SWITCH_STRATEGY: + if joblib_backend == 'auto' and len(feature2id) >= N_FEATURE_SWITCH_STRATEGY: joblib_backend = 'threading' - if joblib_backend == 'auto' and len(feature2id_dict) < N_FEATURE_SWITCH_STRATEGY: + if joblib_backend == 'auto' and len(feature2id) < N_FEATURE_SWITCH_STRATEGY: joblib_backend = 'multiprocessing' counted_frequency = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( @@ -112,6 +111,17 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple for doc_key_freq_tuple in counted_frequency: label2id_dict.update({doc_key_freq_tuple[0]: document_index}) document_index += 1 - feature_frequency.append(doc_key_freq_tuple[1]) - - return SetDocumentInformation(feature_frequency, label2id_dict, feature2id_dict) \ No newline at end of file + if type_flag == set(['str']): + doc_freq = doc_key_freq_tuple[1] + elif type_flag == set(['tuple']): + doc_freq = {pickle.dumps(key): value for key,value in list(doc_key_freq_tuple[1].items())} + else: + raise Exception() + feature_frequency.append( + numpy.array( + list(doc_freq.items()), + dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] + )) + label_max_length = max([len(label) for label in label2id_dict.keys()]) + 10 + label2id = numpy.array(list(label2id_dict.items()), dtype=[('key', 'S{}'.format(label_max_length)), ('value', 'i8')]) + return SetDocumentInformation(feature_frequency, label2id, feature2id) \ No newline at end of file diff --git a/DocumentFeatureSelection/common/utils.py b/DocumentFeatureSelection/common/utils.py index 1a0ba52..02e3727 100644 --- a/DocumentFeatureSelection/common/utils.py +++ b/DocumentFeatureSelection/common/utils.py @@ -9,7 +9,10 @@ import logging import collections import joblib +import typing +import numpy import sys +import pickle python_version = sys.version_info logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) @@ -53,7 +56,7 @@ def __get_value_index(row_index, column_index, weight_csr_matrix, verbose=False) return value -def make_non_zero_information(weight_csr_matrix): +def make_non_zero_information(weight_csr_matrix:csr_matrix): """Construct Tuple of matrix value. Return value is array of ROW_COL_VAL namedtuple. 
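For reference, a toy sketch (independent of this module) of recovering the non-zero (row, col, value) triples that make_non_zero_information builds from a csr_matrix:

```python
from scipy.sparse import csr_matrix

m = csr_matrix([[0, 2, 0],
                [1, 0, 3]])
rows, cols = m.nonzero()
triples = [(int(r), int(c), int(m[r, c])) for r, c in zip(rows, cols)]
# triples == [(0, 1, 2), (1, 0, 1), (1, 2, 3)]
```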
:param weight_csr_matrix: @@ -80,19 +83,31 @@ def make_non_zero_information(weight_csr_matrix): def get_label(row_col_val_tuple, label_id): assert isinstance(row_col_val_tuple, ROW_COL_VAL) - assert isinstance(label_id, dict) + assert isinstance(label_id, numpy.ndarray) + + label = label_id[numpy.where(label_id['key'] == row_col_val_tuple.row)][0]['value'] + try: + original_label = pickle.loads(label) + except (pickle.UnpicklingError, KeyError): + original_label = label.decode('utf-8') - return label_id[row_col_val_tuple.row] + return original_label def get_word(row_col_val_tuple, vocabulary): assert isinstance(row_col_val_tuple, ROW_COL_VAL) - assert isinstance(vocabulary, dict) + assert isinstance(vocabulary, numpy.ndarray) - return vocabulary[row_col_val_tuple.col] + vocab = vocabulary[numpy.where(vocabulary['key'] == row_col_val_tuple.col)][0]['value'] + try: + original_vocab = pickle.loads(vocab) + except (pickle.UnpicklingError, KeyError): + original_vocab = vocab.decode('utf-8') + return original_vocab -def SUB_FUNC_feature_extraction(row_col_val_tuple, id2label, id2vocab): + +def SUB_FUNC_feature_extraction(row_col_val_tuple:typing.Tuple[int,int,int], id2label:numpy.ndarray, id2vocab:numpy.ndarray): """This function returns PMI score between label and words. Input csr matrix must be 'document-frequency' matrix, where records #document that word appears in document set. @@ -139,19 +154,16 @@ def get_feature_dictionary(weighted_matrix, vocabulary, label_group_dict, n_jobs :param bool cut_zero: return all result or not. If cut_zero = True, the method cuts zero features. """ assert isinstance(weighted_matrix, csr_matrix) - assert isinstance(vocabulary, dict) - assert isinstance(label_group_dict, dict) + assert isinstance(vocabulary, numpy.ndarray) + assert isinstance(label_group_dict, numpy.ndarray) assert isinstance(n_jobs, int) logger.debug(msg='Start making scored dictionary object from scored matrix') logger.debug(msg='Input matrix size= {} * {}'.format(weighted_matrix.shape[0], weighted_matrix.shape[1])) value_index_items = make_non_zero_information(weighted_matrix) - id2label = {id:label for label, id in label_group_dict.items()} - if python_version > (3, 0, 0): - id2vocab = {id:voc for voc, id in vocabulary.items()} - else: - id2vocab = {id:voc for voc, id in vocabulary.viewitems()} + id2label = numpy.array([(element['value'], element['key']) for element in label_group_dict], dtype=[('key', 'CB&`T+UCC5a`)QzlQ*Y@AX%MWctkBr`WN zFTEJd5lPNSP0o%lNG(dsFUn2KOHS1*sGQ<0-Xm0yn;Bo8S&|W7l2MeJm;%%>rF}~5 z6b)}lZ$|IK?Nhv^fHVt`mIl&{Kw1V$%R*^6Z$>Z2n?FU{r=(`Ed-L=N!c8nlEK1DD zNzH*8$OqJL4QK#ARHFcpW&rX9p|lWGTo`EJ#pGBCpn;`5ETwswB~#q(HXQrDVC&o7 H(j+|qmi}M1 literal 0 HcmV?d00001 From fdb3c488b0c1741f0058f82e0c53be272352a670 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 21 Sep 2016 23:38:03 +0900 Subject: [PATCH 02/12] refactored tf-idf --- .../common/crs_matrix_constructor.py | 4 +- .../common/data_converter.py | 8 ++-- .../common/labeledMultiDocs2labeledDocsSet.py | 39 +++++++++++++++---- DocumentFeatureSelection/soa/soa_python3.py | 19 ++++++--- examples/check_performance.py | 4 +- 5 files changed, 53 insertions(+), 21 deletions(-) diff --git a/DocumentFeatureSelection/common/crs_matrix_constructor.py b/DocumentFeatureSelection/common/crs_matrix_constructor.py index 9d25eb0..f599c7f 100644 --- a/DocumentFeatureSelection/common/crs_matrix_constructor.py +++ b/DocumentFeatureSelection/common/crs_matrix_constructor.py @@ -44,7 +44,7 @@ def get_data_col_row_values(doc_id:int, word:int, 
doc_freq:int, vocaburary:numpy def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary:numpy.ndarray)->numpy.ndarray: value_pairs = numpy.array([ - get_data_col_row_values(doc_id=doc_id, word=key_value_tuple[0], doc_freq=key_value_tuple[1], vocaburary=vocabulary) + get_data_col_row_values(doc_id=doc_id, word=key_value_tuple['key'], doc_freq=key_value_tuple['value'], vocaburary=vocabulary) for key_value_tuple in doc_freq_obj]) @@ -87,7 +87,7 @@ def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_back assert Exception('joblib_backend parameter must be either of {}. However your input is {}.'.format(PARAM_JOBLIB_BACKEND, joblib_backend)) assert isinstance(feature_frequency, list) - assert isinstance(vocabulary, (numpy.ndarray, numpy.array)) + assert isinstance(vocabulary, numpy.ndarray) assert isinstance(n_jobs, int) logger.debug(msg='making tuple pairs for csr matrix with n(process)={}'.format(n_jobs)) diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index 6b05cdb..986711a 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -149,7 +149,7 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=max(set_document_information.feature2id.values())+1, + n_feature=len(set_document_information.feature2id)+1, n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label @@ -164,9 +164,9 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, ) assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id, dict) - assert isinstance(set_document_information.label2id, dict) - assert isinstance(n_docs_distribution, list) + assert isinstance(set_document_information.label2id, numpy.ndarray) + assert isinstance(set_document_information.label2id, numpy.ndarray) + assert isinstance(n_docs_distribution, numpy.ndarray) return DataCsrMatrix( csr_matrix_, set_document_information.label2id, diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py index c7c6b8c..267067c 100644 --- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py +++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py @@ -35,8 +35,20 @@ def multiDocs2TermFreqInfo(labeled_documents): vocabulary_list = list(set(utils.flatten(labeled_documents.values()))) vocabulary_list = sorted(vocabulary_list) + type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()]) + if type_flag == set(['str']): + feature_list = list(set(utils.flatten(labeled_documents.values()))) + feature_list = sorted(feature_list) + max_lenght = max([len(s) for s in feature_list]) + elif type_flag == set(['tuple']): + # make tuple into string + feature_list = list(set(utils.flatten(labeled_documents.values()))) + feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] + max_lenght = max([len(s) for s in feature_list]) + 10 + else: + raise Exception('Your input data has various type of data. 
Detected types: {}'.format(type_flag)) - vocaburary2id_dict = {t: index for index, t in enumerate(vocabulary_list)} + feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: ndarray # make label: id dictionary structure label2id_dict = {} @@ -45,15 +57,28 @@ def multiDocs2TermFreqInfo(labeled_documents): document_index = 0 for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0]): - words_in_docs = utils.flatten(docs) - feature_frequency.append(dict(Counter(words_in_docs))) label2id_dict.update({key: document_index}) document_index += 1 + words_in_docs = utils.flatten(docs) + if type_flag == set(['str']): + term_freq = Counter(words_in_docs) + elif type_flag == set(['tuple']): + term_freq = {pickle.dumps(key): value for key,value in dict(Counter(words_in_docs)).items()} + else: + raise Exception() + + feature_frequency.append( + numpy.array( + [(index_tuple[1], index_tuple[0]) for index_tuple in enumerate(term_freq)], + dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] + )) - assert isinstance(vocaburary2id_dict, dict) + label_max_length = max([len(label) for label in label2id_dict.keys()]) + 10 + label2id = numpy.array(list(label2id_dict.items()), dtype=[('key', 'S{}'.format(label_max_length)), ('value', 'i8')]) + assert isinstance(feature2id, numpy.ndarray) assert isinstance(feature_frequency, list) - assert isinstance(label2id_dict, dict) - return SetDocumentInformation(feature_frequency, label2id_dict, vocaburary2id_dict) + assert isinstance(label2id, numpy.ndarray) + return SetDocumentInformation(feature_frequency, label2id, feature2id) def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str: @@ -91,7 +116,7 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple else: raise Exception('Your input data has various type of data. 
Detected types: {}'.format(type_flag)) - feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: array + feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: ndarray # make label: id dictionary structure label2id_dict = {} diff --git a/DocumentFeatureSelection/soa/soa_python3.py b/DocumentFeatureSelection/soa/soa_python3.py index d4066bd..82776b0 100644 --- a/DocumentFeatureSelection/soa/soa_python3.py +++ b/DocumentFeatureSelection/soa/soa_python3.py @@ -3,6 +3,7 @@ import logging import joblib import math +import numpy logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', @@ -18,9 +19,9 @@ class SOA(object): def __init__(self): pass - def fit_transform(self, X, unit_distribution, n_jobs=1, verbose=False, joblib_backend='multiprocessing'): + def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False, joblib_backend='multiprocessing'): assert isinstance(X, csr_matrix) - assert isinstance(unit_distribution, list) + assert isinstance(unit_distribution, numpy.ndarray) matrix_size = X.shape sample_range = list(range(0, matrix_size[0])) @@ -56,11 +57,14 @@ def fit_transform(self, X, unit_distribution, n_jobs=1, verbose=False, joblib_ba return soa_featured_csr_matrix - def docId_word_soa(self, X, unit_distribution, n_total_doc, feature_index, sample_index, verbose=False): + def docId_word_soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, + n_total_doc:int, + feature_index:int, + sample_index:int, verbose=False): """ """ assert isinstance(X, csr_matrix) - assert isinstance(unit_distribution, list) + assert isinstance(unit_distribution, numpy.ndarray) assert isinstance(feature_index, int) assert isinstance(sample_index, int) @@ -74,10 +78,13 @@ def docId_word_soa(self, X, unit_distribution, n_total_doc, feature_index, sampl ) return sample_index, feature_index, soa_score - def soa(self, X, unit_distribution, n_total_docs, feature_index, sample_index, verbose=False): + def soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, + n_total_docs:int, + feature_index:int, + sample_index:int, verbose=False): # X is either of term-frequency matrix per label or document-frequency per label assert isinstance(X, csr_matrix) - assert isinstance(unit_distribution, list) + assert isinstance(unit_distribution, numpy.ndarray) assert isinstance(feature_index, int) assert isinstance(sample_index, int) diff --git a/examples/check_performance.py b/examples/check_performance.py index 7286412..c45cd00 100644 --- a/examples/check_performance.py +++ b/examples/check_performance.py @@ -9,7 +9,7 @@ logger.level = logging.DEBUG -@profile +#@profile def pmi_with_parallel(input_corpus): logging.debug(msg='With multiprocessing backend') scored_matrix_obj = interface.run_feature_selection( @@ -20,7 +20,7 @@ def pmi_with_parallel(input_corpus): ) -@profile +#@profile def pmi_with_threading(input_corpus): logging.debug(msg='With threading backend') scored_matrix_obj = interface.run_feature_selection( From 80549b076783cf3913c37c1ad641e2d334fd74e8 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Thu, 22 Sep 2016 14:24:09 +0900 Subject: [PATCH 03/12] resoved bug during making term frequency matrix --- DocumentFeatureSelection/bns/bns_python3.py | 16 +++++-- .../common/data_converter.py | 4 +- .../common/labeledMultiDocs2labeledDocsSet.py | 23 ++++++---- DocumentFeatureSelection/interface.py | 4 +- README.md | 16 +++---- 
examples/example_python3.py | 42 +++++++++++++++++++ setup.py | 2 +- 7 files changed, 84 insertions(+), 23 deletions(-) diff --git a/DocumentFeatureSelection/bns/bns_python3.py b/DocumentFeatureSelection/bns/bns_python3.py index 07009d7..2b96235 100644 --- a/DocumentFeatureSelection/bns/bns_python3.py +++ b/DocumentFeatureSelection/bns/bns_python3.py @@ -26,7 +26,7 @@ def __check_matrix_form(self, X): if n_categories != 2: raise Exception('BNS input must be of 2 categories') - def fit_transform(self, X, y=None, **fit_params): + def fit_transform(self, X:csr_matrix, y=None, **fit_params): assert isinstance(X, csr_matrix) if not 'unit_distribution' in fit_params: @@ -88,7 +88,12 @@ def fit_transform(self, X, y=None, **fit_params): return bns_featured_csr_matrix - def docId_word_BNS(self, X, feature_index, sample_index, unit_distribution, true_index, verbose=False): + def docId_word_BNS(self, X:csr_matrix, + feature_index:int, + sample_index:int, + unit_distribution:np.ndarray, + true_index:int, + verbose=False): assert isinstance(X, csr_matrix) assert isinstance(feature_index, int) @@ -104,7 +109,12 @@ def docId_word_BNS(self, X, feature_index, sample_index, unit_distribution, true ) return sample_index, feature_index, bns_score - def bns(self, X, feature_index, sample_index, unit_distribution, true_index=0, verbose=False): + def bns(self, X:csr_matrix, + feature_index:int, + sample_index:int, + unit_distribution:np.ndarray, + true_index:int=0, + verbose:bool=False): if true_index==0: false_index = 1 elif true_index==1: diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index 986711a..dc77b66 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -149,7 +149,7 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=len(set_document_information.feature2id)+1, + n_feature=len(set_document_information.feature2id), n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label @@ -250,7 +250,7 @@ def labeledMultiDocs2DocFreqMatrix(self, logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=len(set_document_information.feature2id)+1, + n_feature=len(set_document_information.feature2id), n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py index 267067c..a178210 100644 --- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py +++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py @@ -11,18 +11,22 @@ logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) N_FEATURE_SWITCH_STRATEGY = 1000000 +def decode_into_utf8(string:str)->bytes: + """* what you can do + - convert string into etf-8 + """ + return string.encode('utf-8') + def generate_document_dict(document_key:str, - documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Dict[Union[str,bytes], int]]: + documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Counter]: """This function gets Document-frequency count in given list of documents """ assert isinstance(documents, list) 
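The loop that follows counts document frequency (how many documents contain a term), not term frequency; the same Counter idiom in isolation, with made-up documents:

```python
from collections import Counter

documents = [["aa", "aa", "bb"], ["aa", "cc"]]
document_frequencies = Counter()
for word_frequency in (Counter(document) for document in documents):
    # updating with keys() adds 1 per document, ignoring within-document repeats
    document_frequencies.update(word_frequency.keys())
# document_frequencies == Counter({'aa': 2, 'bb': 1, 'cc': 1})
```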
word_frequencies = [Counter(document) for document in documents] document_frequencies = Counter() for word_frequency in word_frequencies: document_frequencies.update(word_frequency.keys()) - document_frequency_dict = dict(document_frequencies) - assert isinstance(document_frequency_dict, dict) - return (document_key, document_frequency_dict) + return (document_key, document_frequencies) def multiDocs2TermFreqInfo(labeled_documents): @@ -63,13 +67,13 @@ def multiDocs2TermFreqInfo(labeled_documents): if type_flag == set(['str']): term_freq = Counter(words_in_docs) elif type_flag == set(['tuple']): - term_freq = {pickle.dumps(key): value for key,value in dict(Counter(words_in_docs)).items()} + term_freq = {pickle.dumps(key): value for key,value in Counter(words_in_docs).items()} else: raise Exception() feature_frequency.append( numpy.array( - [(index_tuple[1], index_tuple[0]) for index_tuple in enumerate(term_freq)], + [(index_tuple[0], index_tuple[1]) for index_tuple in term_freq.items()], dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] )) @@ -105,11 +109,12 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple assert len(type_flag)==1 if type_flag == set(['str']): - feature_list = list(set(utils.flatten(labeled_documents.values()))) + # all features are encoded into utf-8 + feature_list = [decode_into_utf8(str) for str in list(set(utils.flatten(labeled_documents.values())))] feature_list = sorted(feature_list) max_lenght = max([len(s) for s in feature_list]) elif type_flag == set(['tuple']): - # make tuple into string + # feature tuples are serialized by pickle feature_list = list(set(utils.flatten(labeled_documents.values()))) feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] max_lenght = max([len(s) for s in feature_list]) + 10 @@ -137,7 +142,7 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple label2id_dict.update({doc_key_freq_tuple[0]: document_index}) document_index += 1 if type_flag == set(['str']): - doc_freq = doc_key_freq_tuple[1] + doc_freq = {decode_into_utf8(key):value for key, value in doc_key_freq_tuple[1].items()} elif type_flag == set(['tuple']): doc_freq = {pickle.dumps(key): value for key,value in list(doc_key_freq_tuple[1].items())} else: diff --git a/DocumentFeatureSelection/interface.py b/DocumentFeatureSelection/interface.py index a721d26..8e50448 100644 --- a/DocumentFeatureSelection/interface.py +++ b/DocumentFeatureSelection/interface.py @@ -8,6 +8,7 @@ from typing import List, Dict, Any, Union, Tuple from scipy.sparse.csr import csr_matrix import logging +import numpy logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) METHOD_NAMES = ['soa', 'pmi', 'tf_idf', 'bns'] N_FEATURE_SWITCH_STRATEGY = 1000000 @@ -97,7 +98,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] joblib_backend=joblib_backend) assert isinstance(matrix_data_object, DataCsrMatrix) - true_class_index = matrix_data_object.label2id_dict['positive'] + true_class_index = matrix_data_object.label2id_dict[ + numpy.where(matrix_data_object.label2id_dict['key'] == b'positive')]['value'][0] backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) scored_sparse_matrix = BNS().fit_transform( X=matrix_data_object.csr_matrix_, diff --git a/README.md b/README.md index 0100879..fc1a476 100644 --- a/README.md +++ b/README.md @@ -112,17 +112,15 @@ or See scripts in `examples/` -# Performance -With my MacBookPro (late 2015) and 
version 1.1. - -And input data has 98,600 feature dimensions. +# Change log -- PMI takes around 6 minutes (with both of multiprocessing and multithreading) +For your reference I checked performance under following environment, + +- MacBookPro (late 2015) 3.1 GHz Intel Core i7, 16 GB 1867 MHz DDR3 +- input data has 98,600 feature dimensions. -# Change log - ## 0.6 2016/04/02 supports PMI and TF-IDF under Python3.x @@ -148,7 +146,11 @@ Removed a bug when calling n_gram method of DataConverter * Resolved bottleneck point in pre-processing * Fixed a bug which n_jobs parameter does not work in interface +* PMI takes around 6 minutes (with both of multiprocessing and multithreading) ## 1.2 2016/9/16 * A bug in calculating TF-IDF score, this bug was resolved. + +## 1.3 2016/9/ + diff --git a/examples/example_python3.py b/examples/example_python3.py index e6f279e..8837770 100644 --- a/examples/example_python3.py +++ b/examples/example_python3.py @@ -58,6 +58,26 @@ pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) +input_dict = { + "positive": [ + ["I", "aa", "aa", "aa", "aa", "aa"], + ["bb", "aa", "aa", "aa", "aa", "aa"], + ["I", "aa", "hero", "some", "ok", "aa"] + ], + "negative": [ + ["bb", "bb", "bb"], + ["bb", "bb", "bb"], + ["hero", "ok", "bb"], + ["hero", "cc", "bb"], + ] +} +tf_idf_scored_object = interface.run_feature_selection( + input_dict=input_dict, + method='bns', + n_jobs=1 +) +pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) + # ====================================================================================================== # expert usage @@ -99,4 +119,26 @@ method='tf_idf', n_jobs=5 ) +pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) + + +input_dict_tuple_feature = { + "positive": [ + [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], + [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("guy", "N"),) ], + [ (("i", "N"), ("am", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] + ], + "negative": [ + [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("girl", "N"),) ], + [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("girl", "N"),) ], + [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] + ] +} + + +tf_idf_scored_object = interface.run_feature_selection( + input_dict=input_dict_tuple_feature, + method='bns', + n_jobs=5 +) pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) \ No newline at end of file diff --git a/setup.py b/setup.py index 442317b..6d56571 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ """ __author__ = 'kensuke-mi' -__version__ = '1.2' +__version__ = '1.3' import sys from setuptools import setup, find_packages From 544d94312eac725cccfa12e9131d788ef3aebb17 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Thu, 22 Sep 2016 14:54:36 +0900 Subject: [PATCH 04/12] failed to run cython code with joblib multiprocessing --- .gitignore | 1 + DocumentFeatureSelection/interface.py | 6 ++- DocumentFeatureSelection/pmi/PMI_python3.py | 23 +++++++--- DocumentFeatureSelection/pmi/pmi.pyx | 49 +++++++++++++++++++++ examples/example_python3.py | 3 +- setup.py | 7 ++- 6 files changed, 78 insertions(+), 11 deletions(-) create mode 100644 DocumentFeatureSelection/pmi/pmi.pyx diff --git a/.gitignore b/.gitignore index ed673fe..24909e7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ # C extensions *.so +*.c # Distribution / packaging .Python diff 
--git a/DocumentFeatureSelection/interface.py b/DocumentFeatureSelection/interface.py index 8e50448..be84b64 100644 --- a/DocumentFeatureSelection/interface.py +++ b/DocumentFeatureSelection/interface.py @@ -26,7 +26,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] ngram:int=1, n_jobs:int=1, joblib_backend='auto', - matrix_form=None)->ScoredResultObject: + matrix_form=None, + use_cython:bool=False)->ScoredResultObject: if not method in METHOD_NAMES: raise Exception('method name must be either of {}. Yours: {}'.format(METHOD_NAMES, method)) @@ -57,7 +58,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] scored_sparse_matrix = PMI().fit_transform(X=matrix_data_object.csr_matrix_, n_docs_distribution=matrix_data_object.n_docs_distribution, n_jobs=n_jobs, - joblib_backend=backend_strategy) + joblib_backend=backend_strategy, + use_cython=use_cython) assert isinstance(scored_sparse_matrix, csr_matrix) elif method == 'soa': backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) diff --git a/DocumentFeatureSelection/pmi/PMI_python3.py b/DocumentFeatureSelection/pmi/PMI_python3.py index c5760ee..96f0472 100644 --- a/DocumentFeatureSelection/pmi/PMI_python3.py +++ b/DocumentFeatureSelection/pmi/PMI_python3.py @@ -5,6 +5,7 @@ from __future__ import division from scipy.sparse import csr_matrix from logging import getLogger, StreamHandler + import logging import joblib import math @@ -27,7 +28,12 @@ class PMI(object): def __init__(self): pass - def fit_transform(self, X, n_docs_distribution, n_jobs=1, verbose=False, joblib_backend='multiprocessing'): + def fit_transform(self, X, + n_docs_distribution, + n_jobs=1, + verbose=False, + joblib_backend='multiprocessing', + use_cython:bool=False): """Main method of PMI class. """ assert isinstance(X, csr_matrix) @@ -41,6 +47,13 @@ def fit_transform(self, X, n_docs_distribution, n_jobs=1, verbose=False, joblib_ logger.debug(msg='Start calculating PMI with n(process)={}'.format(n_jobs)) logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) + if use_cython: + import pyximport; pyximport.install() + from DocumentFeatureSelection.pmi import pmi + self.pmi = pmi + else: + self.pmi = self.pmi + pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( joblib.delayed(self.docId_word_PMI)( X=X, @@ -71,7 +84,8 @@ def docId_word_PMI(self, X:csr_matrix, n_total_doc:int, feature_index:int, sample_index:int, - verbose=False): + verbose=False, + use_cython:bool=False): """Calculate PMI score for fit_format() :param X: @@ -81,11 +95,6 @@ def docId_word_PMI(self, X:csr_matrix, :param label: :return: """ - assert isinstance(X, csr_matrix) - assert isinstance(n_docs_distribution, numpy.ndarray) - assert isinstance(feature_index, int) - assert isinstance(sample_index, int) - pmi_score = self.pmi( X=X, n_docs_distribution=n_docs_distribution, diff --git a/DocumentFeatureSelection/pmi/pmi.pyx b/DocumentFeatureSelection/pmi/pmi.pyx new file mode 100644 index 0000000..5e72b6e --- /dev/null +++ b/DocumentFeatureSelection/pmi/pmi.pyx @@ -0,0 +1,49 @@ +import numpy +import math + +def pmi(X, + n_docs_distribution, + n_total_doc, + feature_index, + sample_index, verbose=False): + """get PMI score for given feature & sample index + + :param X: + :param feature_index: + :param sample_index: + :return: + """ + matrix_size = X.shape + sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] + + # n_11 is #docs having feature(i.e. 
word) in the specified index(label) + n_11 = X[sample_index, feature_index] + # n_01 is #docs NOT having feature in the specified index(label) + n_01 = n_docs_distribution[sample_index] - n_11 + # n_10 is #docs having feature in NOT specified index(indexes except specified index) + n_10 = X[sample_indexes, feature_index].sum() + # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) + n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) + + if verbose: + print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) + print('n_11:{} n_01:{} n_10:{} n_00:{}'.format( + n_11, + n_01, + n_10, + n_00 + )) + + if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: + return 0 + else: + temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) + temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) + temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) + temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) + score = temp1 + temp2 + temp3 + temp4 + + if score < 0: + raise Exception('score under 0 is detected. Something strange in Input matrix. Check your input matrix.') + + return score \ No newline at end of file diff --git a/examples/example_python3.py b/examples/example_python3.py index 8837770..edbe7c6 100644 --- a/examples/example_python3.py +++ b/examples/example_python3.py @@ -35,7 +35,8 @@ input_dict=input_dict, method='pmi', ngram=1, - n_jobs=5 + n_jobs=1, + use_cython=True ) pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) diff --git a/setup.py b/setup.py index 6d56571..d0ef760 100644 --- a/setup.py +++ b/setup.py @@ -7,12 +7,15 @@ import sys from setuptools import setup, find_packages +from Cython.Build import cythonize +from distutils.extension import Extension +from Cython.Distutils import build_ext python_version = sys.version_info if python_version >= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', - 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc'] + 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython'] try: @@ -49,4 +52,6 @@ install_requires=install_requires, setup_requires=['six', 'setuptools>=1.0'], classifiers=[], + cmdclass={'build_ext': build_ext}, + ext_modules=[Extension("pmi", ["DocumentFeatureSelection/pmi/pmi.pyx"])] ) From c7a5edfa48daf90f4fb2c6d67d8bffafb60806f8 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Tue, 27 Sep 2016 10:11:24 +0900 Subject: [PATCH 05/12] used skleran for feature vectorising --- .../common/crs_matrix_constructor.py | 3 +- .../common/data_converter.py | 69 ++--------- .../common/labeledMultiDocs2labeledDocsSet.py | 116 ++++-------------- DocumentFeatureSelection/common/utils.py | 35 +++--- DocumentFeatureSelection/interface.py | 5 +- DocumentFeatureSelection/models.py | 18 +-- examples/example_python3.py | 28 ++--- setup.py | 2 +- 8 files changed, 81 insertions(+), 195 deletions(-) diff --git a/DocumentFeatureSelection/common/crs_matrix_constructor.py b/DocumentFeatureSelection/common/crs_matrix_constructor.py index f599c7f..457112d 100644 --- a/DocumentFeatureSelection/common/crs_matrix_constructor.py +++ b/DocumentFeatureSelection/common/crs_matrix_constructor.py @@ -4,8 +4,9 @@ import sys import logging import numpy -from typing import List, Tuple +from typing import List, Tuple, Dict from scipy.sparse import csr_matrix +from sklearn.feature_extraction import DictVectorizer 
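This commit moves vectorisation onto scikit-learn's DictVectorizer; a minimal sketch of the flow the later hunks adopt (toy counts, variable names only for illustration):

```python
from sklearn.feature_extraction import DictVectorizer

feature_documents = [{"aa": 5, "bb": 1}, {"bb": 3, "cc": 2}]  # one frequency dict per label
vec = DictVectorizer()
matrix_object = vec.fit_transform(feature_documents).tocsr()  # labels x features, sparse
# get_feature_names() is the call this patch uses; newer scikit-learn renames it
# to get_feature_names_out()
feature2id = {feat: feat_id for feat_id, feat in enumerate(vec.get_feature_names())}
# feature2id == {'aa': 0, 'bb': 1, 'cc': 2}
```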
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index dc77b66..15c035a 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -68,11 +68,12 @@ def __check_data_structure(self, labeled_documents): return True - def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:numpy.core.multiarray.array): + + def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int]): """Count term-distribution per label. """ assert isinstance(labeled_documents, dict) - assert isinstance(label2id, numpy.ndarray) + assert isinstance(label2id, dict) # count total term-frequency per label term_frequency_distribution = { @@ -85,12 +86,14 @@ def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any] term_frequency_distribution_list = [0] * len(labeled_documents.keys()) for label_string, n_doc in term_frequency_distribution.items(): - term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + #term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + term_index = label2id[label_string] term_frequency_distribution_list[term_index] = n_doc return numpy.array(term_frequency_distribution_list, dtype='i8') - def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:numpy.core.multiarray.array)->numpy.ndarray: + + def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int])->numpy.ndarray: """This method count n(docs) per label. 
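In outline, the method packs per-label document counts into an id-indexed vector; a compact sketch with invented labels:

```python
import numpy

labeled_documents = {"positive": [["a"], ["b"], ["c"]], "negative": [["d"], ["e"]]}
label2id = {"negative": 0, "positive": 1}

n_doc_distribution_list = [0] * len(labeled_documents)
for label_string, docs in labeled_documents.items():
    n_doc_distribution_list[label2id[label_string]] = len(docs)
n_docs_distribution = numpy.array(n_doc_distribution_list, dtype='i8')
# n_docs_distribution == array([2, 3])
```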
:param labeled_documents: @@ -98,7 +101,7 @@ def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], lab :return: """ assert isinstance(labeled_documents, dict) - assert isinstance(label2id, numpy.ndarray) + assert isinstance(label2id, dict) # count n(docs) per label n_doc_distribution = { @@ -111,7 +114,8 @@ def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], lab n_doc_distribution_list = [0] * len(labeled_documents.keys()) for label_string, n_doc in n_doc_distribution.items(): - docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + #docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + docs_index = label2id[label_string] n_doc_distribution_list[docs_index] = n_doc return numpy.array(n_doc_distribution_list, dtype='i8') @@ -132,25 +136,6 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, logger.debug(msg='Now pre-processing before CSR matrix') # convert data structure set_document_information = labeledMultiDocs2labeledDocsSet.multiDocs2TermFreqInfo(labeled_documents) - assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation) - logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature_frequency))) - if joblib_backend == 'auto' and len(set_document_information.feature_frequency) >= 100000: - joblib_backend = 'threading' - if joblib_backend == 'auto' and len(set_document_information.feature_frequency) < 100000: - joblib_backend = 'multiprocessing' - - # make set of tuples to construct csr_matrix - row, col, data = crs_matrix_constructor.preprocess_csr_matrix( - feature_frequency=set_document_information.feature_frequency, - vocabulary=set_document_information.feature2id, - n_jobs=n_jobs, - joblib_backend=joblib_backend - ) - logger.debug(msg='Finished pre-processing before CSR matrix') - csr_matrix_ = crs_matrix_constructor.make_csr_objects( - row=row, col=col, data=data, - n_feature=len(set_document_information.feature2id), - n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label n_docs_distribution = self.count_document_distribution( @@ -163,12 +148,8 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, label2id=set_document_information.label2id ) - assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id, numpy.ndarray) - assert isinstance(set_document_information.label2id, numpy.ndarray) - assert isinstance(n_docs_distribution, numpy.ndarray) return DataCsrMatrix( - csr_matrix_, + set_document_information.matrix_object, set_document_information.label2id, set_document_information.feature2id, n_docs_distribution, term_frequency_distribution) @@ -231,27 +212,8 @@ def labeledMultiDocs2DocFreqMatrix(self, logger.debug(msg='Now pre-processing before CSR matrix') # convert data structure set_document_information = labeledMultiDocs2labeledDocsSet.multiDocs2DocFreqInfo(labeled_documents, - n_jobs=n_jobs, - joblib_backend=joblib_backend) + n_jobs=n_jobs) assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation) - logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature2id))) - if joblib_backend == 'auto' and len(set_document_information.feature_frequency) >= 100000: - joblib_backend = 'threading' - if joblib_backend == 'auto' and 
len(set_document_information.feature_frequency) < 100000: - joblib_backend = 'multiprocessing' - - # make set of tuples to construct csr_matrix - row, col, data = crs_matrix_constructor.preprocess_csr_matrix( - feature_frequency=set_document_information.feature_frequency, - vocabulary=set_document_information.feature2id, - n_jobs=n_jobs, - joblib_backend=joblib_backend - ) - logger.debug(msg='Finished pre-processing before CSR matrix') - csr_matrix_ = crs_matrix_constructor.make_csr_objects( - row=row, col=col, data=data, - n_feature=len(set_document_information.feature2id), - n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label n_docs_distribution = self.count_document_distribution( @@ -263,13 +225,8 @@ def labeledMultiDocs2DocFreqMatrix(self, labeled_documents=labeled_documents, label2id=set_document_information.label2id ) - - assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id, numpy.ndarray) - assert isinstance(set_document_information.feature2id, numpy.ndarray) - assert isinstance(n_docs_distribution, numpy.ndarray) return DataCsrMatrix( - csr_matrix_, + set_document_information.matrix_object, set_document_information.label2id, set_document_information.feature2id, n_docs_distribution, term_frequency_distribution) diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py index a178210..f490eaf 100644 --- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py +++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py @@ -3,14 +3,15 @@ from DocumentFeatureSelection.common import utils from DocumentFeatureSelection.models import SetDocumentInformation from DocumentFeatureSelection import init_logger +from sklearn.feature_extraction import DictVectorizer from typing import Dict, List, Tuple, Any, Union import logging import joblib -import numpy -import pickle +import itertools logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) N_FEATURE_SWITCH_STRATEGY = 1000000 + def decode_into_utf8(string:str)->bytes: """* what you can do - convert string into etf-8 @@ -37,52 +38,17 @@ def multiDocs2TermFreqInfo(labeled_documents): """ assert isinstance(labeled_documents, dict) - vocabulary_list = list(set(utils.flatten(labeled_documents.values()))) - vocabulary_list = sorted(vocabulary_list) - type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()]) - if type_flag == set(['str']): - feature_list = list(set(utils.flatten(labeled_documents.values()))) - feature_list = sorted(feature_list) - max_lenght = max([len(s) for s in feature_list]) - elif type_flag == set(['tuple']): - # make tuple into string - feature_list = list(set(utils.flatten(labeled_documents.values()))) - feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] - max_lenght = max([len(s) for s in feature_list]) + 10 - else: - raise Exception('Your input data has various type of data. 
Detected types: {}'.format(type_flag)) - - feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: ndarray - - # make label: id dictionary structure - label2id_dict = {} - # make list of Term-Frequency - feature_frequency = [] - document_index = 0 - - for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0]): - label2id_dict.update({key: document_index}) - document_index += 1 - words_in_docs = utils.flatten(docs) - if type_flag == set(['str']): - term_freq = Counter(words_in_docs) - elif type_flag == set(['tuple']): - term_freq = {pickle.dumps(key): value for key,value in Counter(words_in_docs).items()} - else: - raise Exception() - - feature_frequency.append( - numpy.array( - [(index_tuple[0], index_tuple[1]) for index_tuple in term_freq.items()], - dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] - )) - - label_max_length = max([len(label) for label in label2id_dict.keys()]) + 10 - label2id = numpy.array(list(label2id_dict.items()), dtype=[('key', 'S{}'.format(label_max_length)), ('value', 'i8')]) - assert isinstance(feature2id, numpy.ndarray) - assert isinstance(feature_frequency, list) - assert isinstance(label2id, numpy.ndarray) - return SetDocumentInformation(feature_frequency, label2id, feature2id) + counted_frequency = [(label, Counter(list(itertools.chain.from_iterable(documents)))) + for label, documents in labeled_documents.items()] + feature_documents = [dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency] + + # use sklearn feature-extraction + vec = DictVectorizer() + matrix_object = vec.fit_transform(feature_documents).tocsr() + feature2id = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())} + label2id = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)} + + return SetDocumentInformation(matrix_object, label2id, feature2id) def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str: @@ -100,7 +66,6 @@ def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str: def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple[Any]]]]], - joblib_backend:str='auto', n_jobs:int=1)->SetDocumentInformation: """This function generates information for constructing document-frequency matrix. """ @@ -108,50 +73,15 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()]) assert len(type_flag)==1 - if type_flag == set(['str']): - # all features are encoded into utf-8 - feature_list = [decode_into_utf8(str) for str in list(set(utils.flatten(labeled_documents.values())))] - feature_list = sorted(feature_list) - max_lenght = max([len(s) for s in feature_list]) - elif type_flag == set(['tuple']): - # feature tuples are serialized by pickle - feature_list = list(set(utils.flatten(labeled_documents.values()))) - feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] - max_lenght = max([len(s) for s in feature_list]) + 10 - else: - raise Exception('Your input data has various type of data. 
Detected types: {}'.format(type_flag)) - - feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: ndarray - - # make label: id dictionary structure - label2id_dict = {} - # list of document-frequency array - feature_frequency = [] - - if joblib_backend == 'auto' and len(feature2id) >= N_FEATURE_SWITCH_STRATEGY: - joblib_backend = 'threading' - if joblib_backend == 'auto' and len(feature2id) < N_FEATURE_SWITCH_STRATEGY: - joblib_backend = 'multiprocessing' - - counted_frequency = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( + counted_frequency = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(generate_document_dict)(key, docs) for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0])) + feature_documents = [dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency] + + # use sklearn feature-extraction + vec = DictVectorizer() + matrix_object = vec.fit_transform(feature_documents).tocsr() + feature2id = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())} + label2id = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)} - document_index = 0 - for doc_key_freq_tuple in counted_frequency: - label2id_dict.update({doc_key_freq_tuple[0]: document_index}) - document_index += 1 - if type_flag == set(['str']): - doc_freq = {decode_into_utf8(key):value for key, value in doc_key_freq_tuple[1].items()} - elif type_flag == set(['tuple']): - doc_freq = {pickle.dumps(key): value for key,value in list(doc_key_freq_tuple[1].items())} - else: - raise Exception() - feature_frequency.append( - numpy.array( - list(doc_freq.items()), - dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] - )) - label_max_length = max([len(label) for label in label2id_dict.keys()]) + 10 - label2id = numpy.array(list(label2id_dict.items()), dtype=[('key', 'S{}'.format(label_max_length)), ('value', 'i8')]) - return SetDocumentInformation(feature_frequency, label2id, feature2id) \ No newline at end of file + return SetDocumentInformation(matrix_object, label2id, feature2id) \ No newline at end of file diff --git a/DocumentFeatureSelection/common/utils.py b/DocumentFeatureSelection/common/utils.py index 02e3727..985eae3 100644 --- a/DocumentFeatureSelection/common/utils.py +++ b/DocumentFeatureSelection/common/utils.py @@ -83,28 +83,23 @@ def make_non_zero_information(weight_csr_matrix:csr_matrix): def get_label(row_col_val_tuple, label_id): assert isinstance(row_col_val_tuple, ROW_COL_VAL) - assert isinstance(label_id, numpy.ndarray) + #assert isinstance(label_id, numpy.ndarray) + assert isinstance(label_id, dict) - label = label_id[numpy.where(label_id['key'] == row_col_val_tuple.row)][0]['value'] - try: - original_label = pickle.loads(label) - except (pickle.UnpicklingError, KeyError): - original_label = label.decode('utf-8') + #label = label_id[numpy.where(label_id['key'] == row_col_val_tuple.row)][0]['value'] + label = label_id[row_col_val_tuple.row] - return original_label + return label def get_word(row_col_val_tuple, vocabulary): assert isinstance(row_col_val_tuple, ROW_COL_VAL) - assert isinstance(vocabulary, numpy.ndarray) - - vocab = vocabulary[numpy.where(vocabulary['key'] == row_col_val_tuple.col)][0]['value'] - try: - original_vocab = pickle.loads(vocab) - except (pickle.UnpicklingError, KeyError): - original_vocab = vocab.decode('utf-8') + #assert isinstance(vocabulary, numpy.ndarray) + assert 
isinstance(vocabulary, dict) + #vocab = vocabulary[numpy.where(vocabulary['key'] == row_col_val_tuple.col)][0]['value'] + vocab = vocabulary[row_col_val_tuple.col] - return original_vocab + return vocab def SUB_FUNC_feature_extraction(row_col_val_tuple:typing.Tuple[int,int,int], id2label:numpy.ndarray, id2vocab:numpy.ndarray): @@ -154,16 +149,18 @@ def get_feature_dictionary(weighted_matrix, vocabulary, label_group_dict, n_jobs :param bool cut_zero: return all result or not. If cut_zero = True, the method cuts zero features. """ assert isinstance(weighted_matrix, csr_matrix) - assert isinstance(vocabulary, numpy.ndarray) - assert isinstance(label_group_dict, numpy.ndarray) + assert isinstance(vocabulary, dict) + assert isinstance(label_group_dict, dict) assert isinstance(n_jobs, int) logger.debug(msg='Start making scored dictionary object from scored matrix') logger.debug(msg='Input matrix size= {} * {}'.format(weighted_matrix.shape[0], weighted_matrix.shape[1])) value_index_items = make_non_zero_information(weighted_matrix) - id2label = numpy.array([(element['value'], element['key']) for element in label_group_dict], dtype=[('key', '= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', - 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython'] + 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython', 'scikit-learn'] try: From 1c24bb3ec76e0618b8aaa87f756d75b0bd12386e Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Tue, 27 Sep 2016 14:30:27 +0900 Subject: [PATCH 06/12] added cython code, but having problem in cthonize --- DocumentFeatureSelection/pmi/PMI_python3.py | 140 +++++++++++--------- DocumentFeatureSelection/pmi/pmi.pyx | 49 ------- DocumentFeatureSelection/pmi/pmi_cython.pyx | 71 ++++++++++ examples/check_performance.py | 23 +++- examples/example_python3.py | 14 +- setup.py | 8 +- 6 files changed, 185 insertions(+), 120 deletions(-) delete mode 100644 DocumentFeatureSelection/pmi/pmi.pyx create mode 100644 DocumentFeatureSelection/pmi/pmi_cython.pyx diff --git a/DocumentFeatureSelection/pmi/PMI_python3.py b/DocumentFeatureSelection/pmi/PMI_python3.py index 96f0472..8932f34 100644 --- a/DocumentFeatureSelection/pmi/PMI_python3.py +++ b/DocumentFeatureSelection/pmi/PMI_python3.py @@ -24,6 +24,60 @@ # TODO normzalized pmiの導入 # http://sucrose.hatenablog.com/entry/2014/12/02/235959 + +def pmi(X:csr_matrix, + n_docs_distribution:numpy.ndarray, + n_total_doc:int, + feature_index:int, + sample_index:int, verbose=False): + """get PMI score for given feature & sample index + + :param X: + :param feature_index: + :param sample_index: + :return: + """ + assert isinstance(X, csr_matrix) + assert isinstance(n_docs_distribution, numpy.ndarray) + assert isinstance(feature_index, int) + assert isinstance(sample_index, int) + + matrix_size = X.shape + sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] + + # n_11 is #docs having feature(i.e. 
word) in the specified index(label) + n_11 = X[sample_index, feature_index] + # n_01 is #docs NOT having feature in the specified index(label) + n_01 = n_docs_distribution[sample_index] - n_11 + # n_10 is #docs having feature in NOT specified index(indexes except specified index) + n_10 = X[sample_indexes, feature_index].sum() + # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) + n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) + + if verbose: + logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) + logging.debug('n_11:{} n_01:{} n_10:{} n_00:{}'.format( + n_11, + n_01, + n_10, + n_00 + )) + + if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: + return 0 + else: + temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) + temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) + temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) + temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) + score = temp1 + temp2 + temp3 + temp4 + + if score < 0: + raise Exception('score under 0 is detected. Something strange in Input matrix. Check your input matrix.') + + return score + + class PMI(object): def __init__(self): pass @@ -49,23 +103,29 @@ def fit_transform(self, X, if use_cython: import pyximport; pyximport.install() - from DocumentFeatureSelection.pmi import pmi - self.pmi = pmi + from DocumentFeatureSelection.pmi.pmi_cython import main + logger.warning(msg='n_jobs parameter is invalid when use_cython=True') + print(X.toarray()) + pmi_score_csr_source = main(X=X, + n_docs_distribution=n_docs_distribution, + sample_range=sample_range, + feature_range=feature_range, + n_total_doc=n_total_document) + else: - self.pmi = self.pmi - - pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( - joblib.delayed(self.docId_word_PMI)( - X=X, - n_docs_distribution=n_docs_distribution, - feature_index=feature_index, - sample_index=sample_index, - n_total_doc=n_total_document, - verbose=verbose + self.pmi = pmi + pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( + joblib.delayed(self.docId_word_PMI)( + X=X, + n_docs_distribution=n_docs_distribution, + feature_index=feature_index, + sample_index=sample_index, + n_total_doc=n_total_document, + verbose=verbose + ) + for sample_index in sample_range + for feature_index in feature_range ) - for sample_index in sample_range - for feature_index in feature_range - ) row_list = [t[0] for t in pmi_score_csr_source] col_list = [t[1] for t in pmi_score_csr_source] @@ -105,54 +165,4 @@ def docId_word_PMI(self, X:csr_matrix, ) return sample_index, feature_index, pmi_score - def pmi(self, X:csr_matrix, - n_docs_distribution:numpy.ndarray, - n_total_doc:int, - feature_index:int, - sample_index:int, verbose=False): - """get PMI score for given feature & sample index - - :param X: - :param feature_index: - :param sample_index: - :return: - """ - assert isinstance(X, csr_matrix) - assert isinstance(n_docs_distribution, numpy.ndarray) - assert isinstance(feature_index, int) - assert isinstance(sample_index, int) - - matrix_size = X.shape - sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] - - # n_11 is #docs having feature(i.e. 
word) in the specified index(label) - n_11 = X[sample_index, feature_index] - # n_01 is #docs NOT having feature in the specified index(label) - n_01 = n_docs_distribution[sample_index] - n_11 - # n_10 is #docs having feature in NOT specified index(indexes except specified index) - n_10 = X[sample_indexes, feature_index].sum() - # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) - n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) - - if verbose: - logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) - logging.debug('n_11:{} n_01:{} n_10:{} n_00:{}'.format( - n_11, - n_01, - n_10, - n_00 - )) - - if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: - return 0 - else: - temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) - temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) - temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) - temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) - score = temp1 + temp2 + temp3 + temp4 - - if score < 0: - raise Exception('score under 0 is detected. Something strange in Input matrix. Check your input matrix.') - return score diff --git a/DocumentFeatureSelection/pmi/pmi.pyx b/DocumentFeatureSelection/pmi/pmi.pyx deleted file mode 100644 index 5e72b6e..0000000 --- a/DocumentFeatureSelection/pmi/pmi.pyx +++ /dev/null @@ -1,49 +0,0 @@ -import numpy -import math - -def pmi(X, - n_docs_distribution, - n_total_doc, - feature_index, - sample_index, verbose=False): - """get PMI score for given feature & sample index - - :param X: - :param feature_index: - :param sample_index: - :return: - """ - matrix_size = X.shape - sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] - - # n_11 is #docs having feature(i.e. word) in the specified index(label) - n_11 = X[sample_index, feature_index] - # n_01 is #docs NOT having feature in the specified index(label) - n_01 = n_docs_distribution[sample_index] - n_11 - # n_10 is #docs having feature in NOT specified index(indexes except specified index) - n_10 = X[sample_indexes, feature_index].sum() - # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) - n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) - - if verbose: - print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) - print('n_11:{} n_01:{} n_10:{} n_00:{}'.format( - n_11, - n_01, - n_10, - n_00 - )) - - if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: - return 0 - else: - temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) - temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) - temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) - temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) - score = temp1 + temp2 + temp3 + temp4 - - if score < 0: - raise Exception('score under 0 is detected. Something strange in Input matrix. 
Check your input matrix.') - - return score \ No newline at end of file diff --git a/DocumentFeatureSelection/pmi/pmi_cython.pyx b/DocumentFeatureSelection/pmi/pmi_cython.pyx new file mode 100644 index 0000000..5acbde5 --- /dev/null +++ b/DocumentFeatureSelection/pmi/pmi_cython.pyx @@ -0,0 +1,71 @@ +import math +import scipy +cimport numpy as np + +cdef float pmi(np.ndarray[np.float64_t, ndim=2] X, + int n_samples, + np.ndarray[np.int64_t, ndim=1] n_docs_distribution, + int n_total_doc, + int feature_index, + int sample_index): + """get PMI score for given feature & sample index + """ + cdef i + sample_indexes = [i for i in range(0, n_samples) if i != sample_index] + + # n_11 is #docs having feature(i.e. word) in the specified index(label) + cdef float n_11 = X[sample_index, feature_index] + # n_01 is #docs NOT having feature in the specified index(label) + cdef float n_01 = n_docs_distribution[sample_index] - n_11 + # n_10 is #docs having feature in NOT specified index(indexes except specified index) + cdef float n_10 = X[sample_indexes, feature_index].sum() + # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) + cdef float n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) + + cdef float temp1, temp2, temp3, temp4, score + + if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: + return 0 + else: + temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) + temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) + temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) + temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) + score = temp1 + temp2 + temp3 + temp4 + + if score < 0: + print(score) + raise Exception('PMI score={}. Score under 0 is detected. Something strange in Input matrix. Check your input matrix.'.format(score)) + + return score + + +def main(X, + np.ndarray[np.int64_t, ndim=1] n_docs_distribution, + int n_total_doc, + sample_range, + feature_range): + """What you can do + - calculate PMI score based on given data. + - The function returns list of tuple, whose element is (sample_index, feature_index, score) + - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature. 
+ """ + + cdef int n_samples = X.shape[0] + + if isinstance(X, scipy.sparse.csr_matrix): + X = X.toarray() + + cdef int sample_index, feature_index + pmi_score_csr_source = [ + ( + sample_index, + feature_index, + pmi(X, n_samples, n_docs_distribution, n_total_doc, feature_index, sample_index) + ) + for sample_index in sample_range + for feature_index in feature_range + ] + non_zero_pmi_score_csr_source = [score_tuple for score_tuple in pmi_score_csr_source if not score_tuple[2]==0] + + return non_zero_pmi_score_csr_source \ No newline at end of file diff --git a/examples/check_performance.py b/examples/check_performance.py index c45cd00..b254aeb 100644 --- a/examples/check_performance.py +++ b/examples/check_performance.py @@ -1,6 +1,7 @@ from DocumentFeatureSelection import interface import nltk import logging +import time try: import line_profiler except: @@ -12,16 +13,20 @@ #@profile def pmi_with_parallel(input_corpus): logging.debug(msg='With multiprocessing backend') + start = time.time() scored_matrix_obj = interface.run_feature_selection( input_dict=input_corpus, method='pmi', n_jobs=-1, joblib_backend='multiprocessing' ) + elapsed_time = time.time() - start + print ("elapsed_time with multiprocess:{}".format(elapsed_time)) + "[sec]" #@profile def pmi_with_threading(input_corpus): + start = time.time() logging.debug(msg='With threading backend') scored_matrix_obj = interface.run_feature_selection( input_dict=input_corpus, @@ -29,6 +34,21 @@ def pmi_with_threading(input_corpus): n_jobs=-1, joblib_backend='threading' ) + elapsed_time = time.time() - start + print ("elapsed_time with multithreading:{}".format(elapsed_time)) + "[sec]" + + +def pmi_with_cython(input_corpus): + logging.debug(msg='With cython is True') + start = time.time() + scored_matrix_obj = interface.run_feature_selection( + input_dict=input_corpus, + method='pmi', + n_jobs=-1, + use_cython=True + ) + elapsed_time = time.time() - start + print ("elapsed_time with multithreading:{}".format(elapsed_time)) + "[sec]" from nltk.corpus import gutenberg from nltk.corpus import webtext @@ -47,5 +67,6 @@ def pmi_with_threading(input_corpus): 'gutenberg': list(gutenberg_corpus) } +pmi_with_cython(input_corpus) pmi_with_parallel(input_corpus) -pmi_with_threading(input_corpus) \ No newline at end of file +pmi_with_threading(input_corpus) diff --git a/examples/example_python3.py b/examples/example_python3.py index 800c90b..19241c7 100644 --- a/examples/example_python3.py +++ b/examples/example_python3.py @@ -36,8 +36,7 @@ input_dict=input_dict, method='tf_idf', ngram=1, - n_jobs=5, - use_cython=False + n_jobs=5 ) pmi_scored_object = interface.run_feature_selection( @@ -49,6 +48,17 @@ ) pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) +# you can use cython version pmi also +# !Warning! The output value with "use_cython=True" is veeeery little different such as the 10th decimal place. 
+pmi_scored_object_cython = interface.run_feature_selection( + input_dict=input_dict, + method='pmi', + ngram=1, + n_jobs=1, + use_cython=True +) +pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) + soa_scored_object = interface.run_feature_selection( input_dict=input_dict, diff --git a/setup.py b/setup.py index 5d03ace..4431201 100644 --- a/setup.py +++ b/setup.py @@ -7,17 +7,18 @@ import sys from setuptools import setup, find_packages +import numpy from Cython.Build import cythonize from distutils.extension import Extension from Cython.Distutils import build_ext + python_version = sys.version_info if python_version >= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython', 'scikit-learn'] - try: import pypandoc long_description = pypandoc.convert('README.md', 'rst') @@ -53,5 +54,6 @@ setup_requires=['six', 'setuptools>=1.0'], classifiers=[], cmdclass={'build_ext': build_ext}, - ext_modules=[Extension("pmi", ["DocumentFeatureSelection/pmi/pmi.pyx"])] -) + ext_modules=cythonize("DocumentFeatureSelection/pmi/pmi_cython.pyx"), + include_dirs = [numpy.get_include()] +) \ No newline at end of file From a122dc32d147f456c942e8c8af1aa4ef94de316c Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 28 Sep 2016 09:37:13 +0900 Subject: [PATCH 07/12] added relative compile system of cython codes --- setup.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 4431201..a023c26 100644 --- a/setup.py +++ b/setup.py @@ -7,10 +7,31 @@ import sys from setuptools import setup, find_packages -import numpy -from Cython.Build import cythonize -from distutils.extension import Extension -from Cython.Distutils import build_ext + +# Flags to compile Cython code or use already compiled code +# -------------------------------------------------------------------------------------------------------- +try: + from Cython.Build import cythonize + from distutils.extension import Extension + from Cython.Distutils import build_ext + import numpy +except ImportError: + use_cython = False +else: + use_cython = True + +cmdclass = { } +ext_modules = [ ] +if use_cython: + ext_modules += [ + Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.pyx" ]), + ] + cmdclass.update({ 'build_ext': build_ext }) +else: + ext_modules += [ + Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.c" ]), + ] +# -------------------------------------------------------------------------------------------------------- python_version = sys.version_info @@ -18,6 +39,8 @@ if python_version >= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython', 'scikit-learn'] +else: + raise Exception('This package does NOT support Python2.x') try: import pypandoc @@ -53,7 +76,7 @@ install_requires=install_requires, setup_requires=['six', 'setuptools>=1.0'], classifiers=[], - cmdclass={'build_ext': build_ext}, - ext_modules=cythonize("DocumentFeatureSelection/pmi/pmi_cython.pyx"), + cmdclass=cmdclass, + ext_modules=ext_modules, include_dirs = [numpy.get_include()] ) \ No newline at end of file From 3d541d565ce848ea03c2dd366f4a5c31fd072e5d Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 28 Sep 2016 11:25:06 +0900 Subject: [PATCH 08/12] added cython on soa --- DocumentFeatureSelection/interface.py | 3 +- 
DocumentFeatureSelection/pmi/PMI_python3.py | 7 +- DocumentFeatureSelection/pmi/pmi_cython.pyx | 14 ++- DocumentFeatureSelection/soa/soa_cython.pyx | 76 ++++++++++++ DocumentFeatureSelection/soa/soa_python3.py | 126 +++++++++++--------- examples/check_performance.py | 10 +- examples/example_python3.py | 55 ++++++++- setup.py | 3 +- 8 files changed, 214 insertions(+), 80 deletions(-) create mode 100644 DocumentFeatureSelection/soa/soa_cython.pyx diff --git a/DocumentFeatureSelection/interface.py b/DocumentFeatureSelection/interface.py index 3889208..0673960 100644 --- a/DocumentFeatureSelection/interface.py +++ b/DocumentFeatureSelection/interface.py @@ -66,7 +66,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] scored_sparse_matrix = SOA().fit_transform(X=matrix_data_object.csr_matrix_, unit_distribution=matrix_data_object.n_docs_distribution, n_jobs=n_jobs, - joblib_backend=backend_strategy) + joblib_backend=backend_strategy, + use_cython=use_cython) assert isinstance(scored_sparse_matrix, csr_matrix) elif method == 'soa' and matrix_form == 'term_freq': diff --git a/DocumentFeatureSelection/pmi/PMI_python3.py b/DocumentFeatureSelection/pmi/PMI_python3.py index 8932f34..92d0006 100644 --- a/DocumentFeatureSelection/pmi/PMI_python3.py +++ b/DocumentFeatureSelection/pmi/PMI_python3.py @@ -72,9 +72,6 @@ def pmi(X:csr_matrix, temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) score = temp1 + temp2 + temp3 + temp4 - if score < 0: - raise Exception('score under 0 is detected. Something strange in Input matrix. Check your input matrix.') - return score @@ -105,12 +102,12 @@ def fit_transform(self, X, import pyximport; pyximport.install() from DocumentFeatureSelection.pmi.pmi_cython import main logger.warning(msg='n_jobs parameter is invalid when use_cython=True') - print(X.toarray()) pmi_score_csr_source = main(X=X, n_docs_distribution=n_docs_distribution, sample_range=sample_range, feature_range=feature_range, - n_total_doc=n_total_document) + n_total_doc=n_total_document, + verbose=False) else: self.pmi = pmi diff --git a/DocumentFeatureSelection/pmi/pmi_cython.pyx b/DocumentFeatureSelection/pmi/pmi_cython.pyx index 5acbde5..856ee6c 100644 --- a/DocumentFeatureSelection/pmi/pmi_cython.pyx +++ b/DocumentFeatureSelection/pmi/pmi_cython.pyx @@ -1,13 +1,15 @@ import math import scipy cimport numpy as np +from cpython cimport bool cdef float pmi(np.ndarray[np.float64_t, ndim=2] X, int n_samples, np.ndarray[np.int64_t, ndim=1] n_docs_distribution, int n_total_doc, int feature_index, - int sample_index): + int sample_index, + bool verbose): """get PMI score for given feature & sample index """ cdef i @@ -33,9 +35,8 @@ cdef float pmi(np.ndarray[np.float64_t, ndim=2] X, temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) score = temp1 + temp2 + temp3 + temp4 - if score < 0: - print(score) - raise Exception('PMI score={}. Score under 0 is detected. Something strange in Input matrix. Check your input matrix.'.format(score)) + if verbose: + print('score={}, temp1={}, temp2={}, temp3={}, temp4={}, n11={}, n10={}, n01={}, n00={}, n_total_docs={}'.format(score, temp1, temp2, temp3, temp4, n_11, n_10, n_01, n_00, n_total_doc)) return score @@ -44,7 +45,8 @@ def main(X, np.ndarray[np.int64_t, ndim=1] n_docs_distribution, int n_total_doc, sample_range, - feature_range): + feature_range, + bool verbose=False): """What you can do - calculate PMI score based on given data. 
- The function returns list of tuple, whose element is (sample_index, feature_index, score) @@ -61,7 +63,7 @@ def main(X, ( sample_index, feature_index, - pmi(X, n_samples, n_docs_distribution, n_total_doc, feature_index, sample_index) + pmi(X, n_samples, n_docs_distribution, n_total_doc, feature_index, sample_index, verbose) ) for sample_index in sample_range for feature_index in feature_range diff --git a/DocumentFeatureSelection/soa/soa_cython.pyx b/DocumentFeatureSelection/soa/soa_cython.pyx new file mode 100644 index 0000000..19f4635 --- /dev/null +++ b/DocumentFeatureSelection/soa/soa_cython.pyx @@ -0,0 +1,76 @@ +import math +import scipy +cimport numpy as np +from cpython cimport bool + +cdef float soa( + np.ndarray[np.float64_t, ndim=2] X, + np.ndarray[np.int64_t, ndim=1] unit_distribution, + int n_total_docs, + int feature_index, + int sample_index, + bool verbose): + # X is either of term-frequency matrix per label or document-frequency per label + + matrix_size = X.shape + NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] + + # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e + cdef float freq_w_e = X[sample_index, feature_index] + # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e + cdef float freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() + # freq_e is the number of the unit having specific label e + cdef float freq_e = unit_distribution[sample_index] + # freq_not_e is the number of the unit NOT having the specific label e + cdef float freq_not_e = n_total_docs - freq_e + cdef float nominator, denominator, ans, soa_val + + if verbose: + print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) + print('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format( + freq_w_e, + freq_w_not_e, + freq_e, + freq_not_e + )) + + if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0: + return 0.0 + else: + nominator = (float(freq_w_e) * freq_not_e) + denominator = (float(freq_e) * freq_w_not_e) + ans = nominator / denominator + soa_val = math.log(ans, 2) + return soa_val + + +def main(X, + np.ndarray[np.int64_t, ndim=1] n_docs_distribution, + int n_total_doc, + sample_range, + feature_range, + bool verbose=False): + """What you can do + - calculate PMI score based on given data. + - The function returns list of tuple, whose element is (sample_index, feature_index, score) + - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature. 
+ """ + + cdef int n_samples = X.shape[0] + + if isinstance(X, scipy.sparse.csr_matrix): + X = X.toarray() + + cdef int sample_index, feature_index + soa_score_csr_source = [ + ( + sample_index, + feature_index, + soa(X, n_samples, n_docs_distribution, feature_index, sample_index, verbose) + ) + for sample_index in sample_range + for feature_index in feature_range + ] + non_zero_soa_score_csr_source = [score_tuple for score_tuple in soa_score_csr_source if not score_tuple[2]==0] + + return non_zero_soa_score_csr_source \ No newline at end of file diff --git a/DocumentFeatureSelection/soa/soa_python3.py b/DocumentFeatureSelection/soa/soa_python3.py index 82776b0..1dddc7a 100644 --- a/DocumentFeatureSelection/soa/soa_python3.py +++ b/DocumentFeatureSelection/soa/soa_python3.py @@ -15,11 +15,54 @@ __author__ = 'kensuke-mi' +def soa(X:csr_matrix, unit_distribution:numpy.ndarray, + n_total_docs:int, + feature_index:int, + sample_index:int, verbose=False): + # X is either of term-frequency matrix per label or document-frequency per label + assert isinstance(X, csr_matrix) + assert isinstance(unit_distribution, numpy.ndarray) + assert isinstance(feature_index, int) + assert isinstance(sample_index, int) + + matrix_size = X.shape + NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] + + # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e + freq_w_e = X[sample_index, feature_index] + # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e + freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() + # freq_e is the number of the unit having specific label e + freq_e = unit_distribution[sample_index] + # freq_not_e is the number of the unit NOT having the specific label e + freq_not_e = n_total_docs - freq_e + + if verbose: + logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) + logging.debug('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format( + freq_w_e, + freq_w_not_e, + freq_e, + freq_not_e + )) + + if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0: + return 0 + else: + nominator = (float(freq_w_e) * freq_not_e) + denominator = (float(freq_e) * freq_w_not_e) + ans = nominator / denominator + assert isinstance(ans, float) + soa_val = math.log(ans, 2) + return soa_val + + class SOA(object): def __init__(self): pass - def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False, joblib_backend='multiprocessing'): + def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False, + joblib_backend='multiprocessing', use_cython:bool=False): assert isinstance(X, csr_matrix) assert isinstance(unit_distribution, numpy.ndarray) @@ -31,22 +74,34 @@ def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=Fa logger.debug(msg='Start calculating SOA with n(process)={}'.format(n_jobs)) logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) - pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( - joblib.delayed(self.docId_word_soa)( - X=X, - unit_distribution=unit_distribution, - feature_index=feature_index, - sample_index=sample_index, - n_total_doc=n_total_document, - verbose=verbose + if use_cython: + import pyximport; pyximport.install() + from DocumentFeatureSelection.soa.soa_cython import main + logger.warning(msg='n_jobs parameter is invalid when use_cython=True') + soa_score_csr_source = main(X=X, + 
n_docs_distribution=unit_distribution, + n_total_doc=n_total_document, + sample_range=sample_range, + feature_range=feature_range, + verbose=False) + else: + self.soa = soa + soa_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( + joblib.delayed(self.docId_word_soa)( + X=X, + unit_distribution=unit_distribution, + feature_index=feature_index, + sample_index=sample_index, + n_total_doc=n_total_document, + verbose=verbose + ) + for sample_index in sample_range + for feature_index in feature_range ) - for sample_index in sample_range - for feature_index in feature_range - ) - row_list = [t[0] for t in pmi_score_csr_source] - col_list = [t[1] for t in pmi_score_csr_source] - data_list = [t[2] for t in pmi_score_csr_source] + row_list = [t[0] for t in soa_score_csr_source] + col_list = [t[1] for t in soa_score_csr_source] + data_list = [t[2] for t in soa_score_csr_source] soa_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)), shape=(X.shape[0], @@ -77,44 +132,3 @@ def docId_word_soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, verbose=verbose ) return sample_index, feature_index, soa_score - - def soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, - n_total_docs:int, - feature_index:int, - sample_index:int, verbose=False): - # X is either of term-frequency matrix per label or document-frequency per label - assert isinstance(X, csr_matrix) - assert isinstance(unit_distribution, numpy.ndarray) - assert isinstance(feature_index, int) - assert isinstance(sample_index, int) - - matrix_size = X.shape - NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] - - # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e - freq_w_e = X[sample_index, feature_index] - # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e - freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() - # freq_e is the number of the unit having specific label e - freq_e = unit_distribution[sample_index] - # freq_not_e is the number of the unit NOT having the specific label e - freq_not_e = n_total_docs - freq_e - - if verbose: - logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) - logging.debug('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format( - freq_w_e, - freq_w_not_e, - freq_e, - freq_not_e - )) - - if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0: - return 0 - else: - nominator = (float(freq_w_e) * freq_not_e) - denominator = (float(freq_e) * freq_w_not_e) - ans = nominator / denominator - assert isinstance(ans, float) - soa_val = math.log(ans, 2) - return soa_val \ No newline at end of file diff --git a/examples/check_performance.py b/examples/check_performance.py index b254aeb..c297937 100644 --- a/examples/check_performance.py +++ b/examples/check_performance.py @@ -21,7 +21,7 @@ def pmi_with_parallel(input_corpus): joblib_backend='multiprocessing' ) elapsed_time = time.time() - start - print ("elapsed_time with multiprocess:{}".format(elapsed_time)) + "[sec]" + print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) #@profile @@ -35,7 +35,7 @@ def pmi_with_threading(input_corpus): joblib_backend='threading' ) elapsed_time = time.time() - start - print ("elapsed_time with multithreading:{}".format(elapsed_time)) + "[sec]" + print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) def pmi_with_cython(input_corpus): @@ -48,7 +48,7 @@ def 
pmi_with_cython(input_corpus): use_cython=True ) elapsed_time = time.time() - start - print ("elapsed_time with multithreading:{}".format(elapsed_time)) + "[sec]" + print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) from nltk.corpus import gutenberg from nltk.corpus import webtext @@ -68,5 +68,5 @@ def pmi_with_cython(input_corpus): } pmi_with_cython(input_corpus) -pmi_with_parallel(input_corpus) -pmi_with_threading(input_corpus) +#pmi_with_parallel(input_corpus) +#pmi_with_threading(input_corpus) diff --git a/examples/example_python3.py b/examples/example_python3.py index 19241c7..7cde041 100644 --- a/examples/example_python3.py +++ b/examples/example_python3.py @@ -5,7 +5,7 @@ import logging import pprint logger = logging.getLogger('sample usage') -logger.level = logging.DEBUG +logger.level = logging.ERROR # ====================================================================================================== @@ -31,6 +31,8 @@ ] } +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# tf idf tf_idf_scored_object = interface.run_feature_selection( input_dict=input_dict, @@ -39,6 +41,8 @@ n_jobs=5 ) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# pmi pmi_scored_object = interface.run_feature_selection( input_dict=input_dict, method='pmi', @@ -59,7 +63,8 @@ ) pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) - +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# soa soa_scored_object = interface.run_feature_selection( input_dict=input_dict, method='soa', @@ -68,7 +73,18 @@ ) pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary()) +soa_scored_object_cython = interface.run_feature_selection( + input_dict=input_dict, + method='soa', + ngram=1, + n_jobs=1, + use_cython=True +) +pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary()) + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# bns input_dict = { "positive": [ ["I", "aa", "aa", "aa", "aa", "aa"], @@ -108,7 +124,18 @@ ] } +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# tf idf +tf_idf_scored_object = interface.run_feature_selection( + input_dict=input_dict_tuple_feature, + method='tf_idf', + n_jobs=5 +) +pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# pmi pmi_scored_object = interface.run_feature_selection( input_dict=input_dict_tuple_feature, method='pmi', @@ -117,6 +144,17 @@ pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) +pmi_scored_object_cython = interface.run_feature_selection( + input_dict=input_dict_tuple_feature, + method='pmi', + n_jobs=1, + use_cython=True +) +pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# soa soa_scored_object = interface.run_feature_selection( input_dict=input_dict_tuple_feature, method='soa', @@ -125,14 +163,19 @@ pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary()) -tf_idf_scored_object = interface.run_feature_selection( +soa_scored_object_cython = 
interface.run_feature_selection( input_dict=input_dict_tuple_feature, - method='tf_idf', - n_jobs=5 + method='soa', + n_jobs=1, + use_cython=True ) -pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) +pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary()) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# bns input_dict_tuple_feature = { "positive": [ [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], diff --git a/setup.py b/setup.py index a023c26..2e6fef7 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,8 @@ ext_modules = [ ] if use_cython: ext_modules += [ - Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.pyx" ]), + Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.pyx" ],), + Extension("DocumentFeatureSelection.soa.soa_cython", [ "DocumentFeatureSelection/soa/soa_cython.pyx" ],) ] cmdclass.update({ 'build_ext': build_ext }) else: From c806a24199b4bf9dd60f470cee6886dc68bacc6f Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 28 Sep 2016 11:28:47 +0900 Subject: [PATCH 09/12] updated README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fc1a476..f4fe714 100644 --- a/README.md +++ b/README.md @@ -152,5 +152,9 @@ Removed a bug when calling n_gram method of DataConverter * A bug in calculating TF-IDF score, this bug was resolved. -## 1.3 2016/9/ +## 1.3 2016/9/28 + +* Resolved bottleneck poins in pre-processing + * Introduced dict-vectorising in ScikitLearn + * Introduced Cython in calculating PMI \& SOA. You can call them with `use_cython=True` flag. 
See `examples/example_python3.py` From 28e2ae0d61ec290f3d79e3a3d7d7db02ff5bb560 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 28 Sep 2016 12:35:34 +0900 Subject: [PATCH 10/12] resolved a bug in soa_cython.pyx --- DocumentFeatureSelection/soa/soa_cython.pyx | 2 +- examples/check_performance.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DocumentFeatureSelection/soa/soa_cython.pyx b/DocumentFeatureSelection/soa/soa_cython.pyx index 19f4635..7ab7a97 100644 --- a/DocumentFeatureSelection/soa/soa_cython.pyx +++ b/DocumentFeatureSelection/soa/soa_cython.pyx @@ -66,7 +66,7 @@ def main(X, ( sample_index, feature_index, - soa(X, n_samples, n_docs_distribution, feature_index, sample_index, verbose) + soa(X, n_docs_distribution, n_total_doc, feature_index, sample_index, verbose) ) for sample_index in sample_range for feature_index in feature_range diff --git a/examples/check_performance.py b/examples/check_performance.py index c297937..999093b 100644 --- a/examples/check_performance.py +++ b/examples/check_performance.py @@ -48,7 +48,7 @@ def pmi_with_cython(input_corpus): use_cython=True ) elapsed_time = time.time() - start - print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) + print ("elapsed_time with cython:{} [sec]".format(elapsed_time)) from nltk.corpus import gutenberg from nltk.corpus import webtext @@ -68,5 +68,5 @@ def pmi_with_cython(input_corpus): } pmi_with_cython(input_corpus) -#pmi_with_parallel(input_corpus) +pmi_with_parallel(input_corpus) #pmi_with_threading(input_corpus) From 91805e48edd4659b31b16d373e467e23d840ea3f Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 5 Oct 2016 10:04:10 +0900 Subject: [PATCH 11/12] updated setup.py & discription in README --- README.md | 22 ++++++++++++++++++---- setup.py | 36 +++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index f4fe714..c957f55 100644 --- a/README.md +++ b/README.md @@ -92,9 +92,6 @@ or ```George Forman, "An Extensive Empirical Study of Feature Selection Metrics for Text Classification",Journal of Machine Learning Research 3 (2003) 1289-1305``` - - - # Requirement @@ -107,6 +104,21 @@ or `python setup.py install` +### Note + +You might see error message during running this command, such as + +``` +We failed to install numpy automatically. Try installing numpy manually or Try anaconda distribution. +``` + +This is because `setup.py` tries to instal numpy and scipy with `pip`, however it fails. +We need numpy and scipy before we install `scikit-learn`. + +In this case, you take following choice + +* You install `numpy` and `scipy` manually +* You use `anaconda` python distribution. Please visit [their site](https://www.continuum.io/downloads). # Examples @@ -157,4 +169,6 @@ Removed a bug when calling n_gram method of DataConverter * Resolved bottleneck poins in pre-processing * Introduced dict-vectorising in ScikitLearn * Introduced Cython in calculating PMI \& SOA. You can call them with `use_cython=True` flag. See `examples/example_python3.py` - +* Performance + * Cython PMI takes 11.87 sec. + * Python multiprocessing PMI takes 513.541 sec. (8.55 min.) 
\ No newline at end of file diff --git a/setup.py b/setup.py index 2e6fef7..b3e21f9 100644 --- a/setup.py +++ b/setup.py @@ -6,15 +6,16 @@ __version__ = '1.3' import sys +import pip from setuptools import setup, find_packages +from distutils.extension import Extension + -# Flags to compile Cython code or use already compiled code # -------------------------------------------------------------------------------------------------------- +# Flags to compile Cython code or use already compiled code try: from Cython.Build import cythonize - from distutils.extension import Extension from Cython.Distutils import build_ext - import numpy except ImportError: use_cython = False else: @@ -32,14 +33,34 @@ ext_modules += [ Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.c" ]), ] -# -------------------------------------------------------------------------------------------------------- +# -------------------------------------------------------------------------------------------------------- +# try to install numpy automatically because sklearn requires the status where numpy is already installed +try: + import numpy +except ImportError: + use_numpy_include_dirs = False + try: + pip.main(['install', 'numpy']) + except: + raise Exception('We failed to install numpy automatically. Try installing numpy manually or Try anaconda distribution.') +# -------------------------------------------------------------------------------------------------------- +# try to install scipy automatically because sklearn requires the status where scipy is already installed +try: + import scipy +except ImportError: + use_numpy_include_dirs = False + try: + pip.main(['install', 'scipy']) + except: + raise Exception('We failed to install scipy automatically. Try installing scipy manually or Try anaconda distribution.') +# -------------------------------------------------------------------------------------------------------- python_version = sys.version_info if python_version >= (3, 0, 0): - install_requires = ['six', 'setuptools>=1.0', 'joblib', - 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython', 'scikit-learn'] + install_requires = ['six', 'setuptools>=1.0', 'joblib', 'numpy', + 'scipy', 'nltk', 'scikit-learn', 'pypandoc', 'cython'] else: raise Exception('This package does NOT support Python2.x') @@ -75,7 +96,8 @@ zip_safe=False, test_suite='tests.all_tests.suite', install_requires=install_requires, - setup_requires=['six', 'setuptools>=1.0'], + tests_require=install_requires, + setup_requires=['six', 'setuptools>=1.0', 'pip'], classifiers=[], cmdclass=cmdclass, ext_modules=ext_modules, From 6912bd23d58f2ea859c794aa6c48883bfdb60eab Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 5 Oct 2016 10:05:25 +0900 Subject: [PATCH 12/12] updated version tag --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b3e21f9..4a91f12 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ """ __author__ = 'kensuke-mi' -__version__ = '1.3' +__version__ = '1.3.1' import sys import pip
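
Note on the two core changes in this series, with a small illustrative sketch for readers who want to check the behaviour without building the Cython extension. Everything below is written for this note only: the helper name `pmi_cell`, the toy `labeled_documents` corpus and the dense arrays are assumptions made for illustration, not the package's public API. The shipped implementations live in DocumentFeatureSelection/pmi/PMI_python3.py, DocumentFeatureSelection/pmi/pmi_cython.pyx and DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py, and they operate on scipy csr_matrix input rather than the small dense array used here.

```
import math
import numpy
from collections import Counter
from sklearn.feature_extraction import DictVectorizer

# 1) Pre-processing sketch: build a label-by-feature document-frequency matrix
#    with DictVectorizer, mirroring the path added in multiDocs2DocFreqInfo().
labeled_documents = {
    'label_a': [['hello', 'world'], ['hello', 'python']],
    'label_b': [['cython', 'python'], ['cython', 'speed'], ['speed']],
}
counted = sorted(
    ((label, Counter(word for doc in docs for word in set(doc)))
     for label, docs in labeled_documents.items()),
    key=lambda label_counter: label_counter[0])
vec = DictVectorizer()
X = vec.fit_transform([dict(counter) for _, counter in counted]).toarray()
n_docs_distribution = numpy.array([len(labeled_documents[label]) for label, _ in counted])
n_total_doc = int(n_docs_distribution.sum())

# 2) Scoring sketch: the four-cell PMI evaluated by both the pure-Python and the
#    Cython code paths; n_11/n_01/n_10/n_00 is the per-(label, feature) contingency table.
def pmi_cell(X, n_docs_distribution, n_total_doc, sample_index, feature_index):
    other_rows = [i for i in range(X.shape[0]) if i != sample_index]
    n_11 = X[sample_index, feature_index]                      # docs of this label having the feature
    n_01 = n_docs_distribution[sample_index] - n_11            # docs of this label without the feature
    n_10 = X[other_rows, feature_index].sum()                  # docs of other labels having the feature
    n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index])  # remaining docs
    if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0:
        return 0.0
    return (n_11 / n_total_doc * math.log((n_total_doc * n_11) / ((n_10 + n_11) * (n_01 + n_11)), 2)
            + n_01 / n_total_doc * math.log((n_total_doc * n_01) / ((n_00 + n_01) * (n_01 + n_11)), 2)
            + n_10 / n_total_doc * math.log((n_total_doc * n_10) / ((n_10 + n_11) * (n_00 + n_10)), 2)
            + n_00 / n_total_doc * math.log((n_total_doc * n_00) / ((n_00 + n_01) * (n_00 + n_10)), 2))

# get_feature_names() matches the sklearn versions targeted by this series;
# newer sklearn releases rename it to get_feature_names_out().
for sample_index, (label, _) in enumerate(counted):
    for feature_index, feature in enumerate(vec.get_feature_names()):
        score = pmi_cell(X, n_docs_distribution, n_total_doc, sample_index, feature_index)
        if score != 0.0:
            print(label, feature, round(score, 4))
```

The sketch also suggests where the speed-up in the README comes from: DictVectorizer replaces the hand-built structured-array vocabulary lookups of earlier revisions, and the Cython module evaluates the same per-cell formula in a tight loop instead of dispatching one joblib task per (label, feature) pair, which is consistent with the reported 11.87 sec (Cython) versus 513.541 sec (multiprocessing) timings.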