From 114c644143a1066dcf48868bf01369b6b1056903 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 21 Sep 2016 23:13:46 +0900 Subject: [PATCH 01/12] refactored PMI code --- .../common/crs_matrix_constructor.py | 60 +++++++++++------- .../common/data_converter.py | 59 +++++++++-------- .../common/labeledMultiDocs2labeledDocsSet.py | 40 +++++++----- DocumentFeatureSelection/common/utils.py | 39 ++++++++---- DocumentFeatureSelection/models.py | 28 ++++---- DocumentFeatureSelection/pmi/PMI_python3.py | 20 ++++-- examples/check_performance.py.lprof | Bin 0 -> 265 bytes 7 files changed, 150 insertions(+), 96 deletions(-) create mode 100644 examples/check_performance.py.lprof diff --git a/DocumentFeatureSelection/common/crs_matrix_constructor.py b/DocumentFeatureSelection/common/crs_matrix_constructor.py index 7050b36..9d25eb0 100644 --- a/DocumentFeatureSelection/common/crs_matrix_constructor.py +++ b/DocumentFeatureSelection/common/crs_matrix_constructor.py @@ -3,6 +3,8 @@ import joblib import sys import logging +import numpy +from typing import List, Tuple from scipy.sparse import csr_matrix logging.basicConfig(format='%(asctime)s %(message)s', @@ -15,37 +17,48 @@ python_version = sys.version_info __author__ = 'kensuke-mi' -PosTuple = namedtuple('PosTuple', ('doc_id', 'word_id', 'document_frequency')) + +class PosTuple(object): + __slots__ = ['doc_id', 'word_id', 'document_frequency'] + def __init__(self, doc_id, word_id, document_frequency): + self.doc_id = doc_id + self.word_id = word_id + self.document_frequency = document_frequency + + PARAM_JOBLIB_BACKEND = ['multiprocessing', 'threading'] -def get_data_col_row_values(doc_id:int, word:int, doc_freq:int, vocaburary): - assert isinstance(vocaburary, dict) - try: - col_value = vocaburary[word] - except KeyError: - print() +def get_data_col_row_values(doc_id:int, word:int, doc_freq:int, vocaburary:numpy.ndarray)->numpy.array: + """* what you can do + - You get array of [document_id, feature_id, value(frequency)] + """ + assert isinstance(vocaburary, numpy.ndarray) + col_element = vocaburary[numpy.where(vocaburary['key']==word)] + assert len(col_element) == 1 + col_value = col_element[0]['value'] # df value is word frequency in documents df_value = doc_freq - return PosTuple(doc_id, col_value, df_value) + return numpy.array([doc_id, col_value, df_value]) + +def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary:numpy.ndarray)->numpy.ndarray: + + value_pairs = numpy.array([ + get_data_col_row_values(doc_id=doc_id, word=key_value_tuple[0], doc_freq=key_value_tuple[1], vocaburary=vocabulary) + for key_value_tuple + in doc_freq_obj]) -def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj, vocabulary): - value_pairs = [ - get_data_col_row_values(doc_id=doc_id, word=word, doc_freq=freq, vocaburary=vocabulary) - for word, freq - in doc_freq_obj.items() - ] - assert isinstance(value_pairs, list) return value_pairs -def make_csr_list(value_position_list): + +def make_csr_list(value_position_list:List[numpy.array])->Tuple[List[int], List[int], List[int]]: data = [] row = [] col = [] for position_tuple in value_position_list: - row.append(position_tuple.doc_id) - col.append(position_tuple.word_id) - data.append(position_tuple.document_frequency) + row.append(position_tuple[0]) + col.append(position_tuple[1]) + data.append(position_tuple[2]) return row, col, data @@ -74,7 +87,7 @@ def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_back assert Exception('joblib_backend parameter must be either of 
{}. However your input is {}.'.format(PARAM_JOBLIB_BACKEND, joblib_backend)) assert isinstance(feature_frequency, list) - assert isinstance(vocabulary, dict) + assert isinstance(vocabulary, (numpy.ndarray, numpy.array)) assert isinstance(n_jobs, int) logger.debug(msg='making tuple pairs for csr matrix with n(process)={}'.format(n_jobs)) @@ -86,11 +99,12 @@ def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_back vocabulary ) for doc_id, doc_freq_obj in enumerate(feature_frequency) - ) + ) # type: List[numpy.ndarray] + + # make 2-d list into 1-d list value_position_list = sorted( [l for set in set_value_position_list for l in set], - key=lambda pos_tuple: (pos_tuple[0], pos_tuple[1], pos_tuple[2]) - ) + key=lambda pos_tuple: (pos_tuple[0], pos_tuple[1], pos_tuple[2])) row, col, data = make_csr_list(value_position_list) diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index 38c4eef..6b05cdb 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -12,6 +12,8 @@ from DocumentFeatureSelection import init_logger import logging import sys +import numpy +import pickle from typing import Dict, List, Tuple, Union, Any python_version = sys.version_info logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) @@ -66,11 +68,11 @@ def __check_data_structure(self, labeled_documents): return True - def count_term_frequency_distribution(self, labeled_documents, label2id_dict): + def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:numpy.core.multiarray.array): """Count term-distribution per label. """ assert isinstance(labeled_documents, dict) - assert isinstance(label2id_dict, dict) + assert isinstance(label2id, numpy.ndarray) # count total term-frequency per label term_frequency_distribution = { @@ -83,11 +85,12 @@ def count_term_frequency_distribution(self, labeled_documents, label2id_dict): term_frequency_distribution_list = [0] * len(labeled_documents.keys()) for label_string, n_doc in term_frequency_distribution.items(): - term_frequency_distribution_list[label2id_dict[label_string]] = n_doc + term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + term_frequency_distribution_list[term_index] = n_doc - return term_frequency_distribution_list + return numpy.array(term_frequency_distribution_list, dtype='i8') - def count_document_distribution(self, labeled_documents, label2id_dict): + def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:numpy.core.multiarray.array)->numpy.ndarray: """This method count n(docs) per label. 
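The structured-array lookups introduced in this hunk replace the former plain label2id dicts; a minimal standalone sketch of that access pattern (array contents are illustrative, not taken from the library):

```python
import numpy

# hypothetical label2id table in the same layout the refactor uses
label2id = numpy.array([(b'negative', 0), (b'positive', 1)],
                       dtype=[('key', 'S16'), ('value', 'i8')])

label_string = 'positive'
# same where()-based lookup as count_term_frequency_distribution above
index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value']
assert index == 1
```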
:param labeled_documents: @@ -95,7 +98,7 @@ def count_document_distribution(self, labeled_documents, label2id_dict): :return: """ assert isinstance(labeled_documents, dict) - assert isinstance(label2id_dict, dict) + assert isinstance(label2id, numpy.ndarray) # count n(docs) per label n_doc_distribution = { @@ -108,9 +111,10 @@ def count_document_distribution(self, labeled_documents, label2id_dict): n_doc_distribution_list = [0] * len(labeled_documents.keys()) for label_string, n_doc in n_doc_distribution.items(): - n_doc_distribution_list[label2id_dict[label_string]] = n_doc + docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + n_doc_distribution_list[docs_index] = n_doc - return n_doc_distribution_list + return numpy.array(n_doc_distribution_list, dtype='i8') def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, joblib_backend='auto'): """This function makes TERM-frequency matrix for TF-IDF calculation. @@ -138,39 +142,40 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, # make set of tuples to construct csr_matrix row, col, data = crs_matrix_constructor.preprocess_csr_matrix( feature_frequency=set_document_information.feature_frequency, - vocabulary=set_document_information.feature2id_dict, + vocabulary=set_document_information.feature2id, n_jobs=n_jobs, joblib_backend=joblib_backend ) logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=max(set_document_information.feature2id_dict.values())+1, + n_feature=max(set_document_information.feature2id.values())+1, n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label n_docs_distribution = self.count_document_distribution( labeled_documents=labeled_documents, - label2id_dict=set_document_information.label2id_dict + label2id=set_document_information.label2id ) # count term-frequency per label term_frequency_distribution = self.count_term_frequency_distribution( labeled_documents=labeled_documents, - label2id_dict=set_document_information.label2id_dict + label2id=set_document_information.label2id ) assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id_dict, dict) - assert isinstance(set_document_information.feature2id_dict, dict) + assert isinstance(set_document_information.label2id, dict) + assert isinstance(set_document_information.label2id, dict) assert isinstance(n_docs_distribution, list) return DataCsrMatrix( csr_matrix_, - set_document_information.label2id_dict, - set_document_information.feature2id_dict, + set_document_information.label2id, + set_document_information.feature2id, n_docs_distribution, term_frequency_distribution) - def labeledMultiDocs2DocFreqMatrix(self, labeled_documents, + def labeledMultiDocs2DocFreqMatrix(self, + labeled_documents:Dict[str,List[Any]], ngram:int=1, n_jobs:int=1, joblib_backend:str='auto')->DataCsrMatrix: @@ -229,7 +234,7 @@ def labeledMultiDocs2DocFreqMatrix(self, labeled_documents, n_jobs=n_jobs, joblib_backend=joblib_backend) assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation) - logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature2id_dict))) + logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature2id))) if joblib_backend == 'auto' and 
len(set_document_information.feature_frequency) >= 100000: joblib_backend = 'threading' if joblib_backend == 'auto' and len(set_document_information.feature_frequency) < 100000: @@ -238,35 +243,35 @@ def labeledMultiDocs2DocFreqMatrix(self, labeled_documents, # make set of tuples to construct csr_matrix row, col, data = crs_matrix_constructor.preprocess_csr_matrix( feature_frequency=set_document_information.feature_frequency, - vocabulary=set_document_information.feature2id_dict, + vocabulary=set_document_information.feature2id, n_jobs=n_jobs, joblib_backend=joblib_backend ) logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=max(set_document_information.feature2id_dict.values())+1, + n_feature=len(set_document_information.feature2id)+1, n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label n_docs_distribution = self.count_document_distribution( labeled_documents=labeled_documents, - label2id_dict=set_document_information.label2id_dict + label2id=set_document_information.label2id ) # count term-frequency per label term_frequency_distribution = self.count_term_frequency_distribution( labeled_documents=labeled_documents, - label2id_dict=set_document_information.label2id_dict + label2id=set_document_information.label2id ) assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id_dict, dict) - assert isinstance(set_document_information.feature2id_dict, dict) - assert isinstance(n_docs_distribution, list) + assert isinstance(set_document_information.label2id, numpy.ndarray) + assert isinstance(set_document_information.feature2id, numpy.ndarray) + assert isinstance(n_docs_distribution, numpy.ndarray) return DataCsrMatrix( csr_matrix_, - set_document_information.label2id_dict, - set_document_information.feature2id_dict, + set_document_information.label2id, + set_document_information.feature2id, n_docs_distribution, term_frequency_distribution) diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py index d015991..c7c6b8c 100644 --- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py +++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py @@ -6,11 +6,13 @@ from typing import Dict, List, Tuple, Any, Union import logging import joblib +import numpy +import pickle logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) N_FEATURE_SWITCH_STRATEGY = 1000000 def generate_document_dict(document_key:str, - documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Dict[str, int]]: + documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Dict[Union[str,bytes], int]]: """This function gets Document-frequency count in given list of documents """ assert isinstance(documents, list) @@ -18,12 +20,6 @@ def generate_document_dict(document_key:str, document_frequencies = Counter() for word_frequency in word_frequencies: document_frequencies.update(word_frequency.keys()) document_frequency_dict = dict(document_frequencies) - ''' - V = set([t for d in documents for t in d]) - document_frequency_dict = {} - for v in V: - binary_count = [1 for d in documents if v in d] - document_frequency_dict[v] = sum(binary_count)''' assert isinstance(document_frequency_dict, dict) return (document_key, document_frequency_dict) @@ -86,22 +82,25 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, 
List[List[Union[str, Tuple if type_flag == set(['str']): feature_list = list(set(utils.flatten(labeled_documents.values()))) feature_list = sorted(feature_list) + max_lenght = max([len(s) for s in feature_list]) elif type_flag == set(['tuple']): + # make tuple into string feature_list = list(set(utils.flatten(labeled_documents.values()))) - feature_list = sorted(feature_list) + feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] + max_lenght = max([len(s) for s in feature_list]) + 10 else: raise Exception('Your input data has various type of data. Detected types: {}'.format(type_flag)) - feature2id_dict = {t: index for index, t in enumerate(feature_list)} + feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: array # make label: id dictionary structure label2id_dict = {} - # make list of document-frequency + # list of document-frequency array feature_frequency = [] - if joblib_backend == 'auto' and len(feature2id_dict) >= N_FEATURE_SWITCH_STRATEGY: + if joblib_backend == 'auto' and len(feature2id) >= N_FEATURE_SWITCH_STRATEGY: joblib_backend = 'threading' - if joblib_backend == 'auto' and len(feature2id_dict) < N_FEATURE_SWITCH_STRATEGY: + if joblib_backend == 'auto' and len(feature2id) < N_FEATURE_SWITCH_STRATEGY: joblib_backend = 'multiprocessing' counted_frequency = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( @@ -112,6 +111,17 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple for doc_key_freq_tuple in counted_frequency: label2id_dict.update({doc_key_freq_tuple[0]: document_index}) document_index += 1 - feature_frequency.append(doc_key_freq_tuple[1]) - - return SetDocumentInformation(feature_frequency, label2id_dict, feature2id_dict) \ No newline at end of file + if type_flag == set(['str']): + doc_freq = doc_key_freq_tuple[1] + elif type_flag == set(['tuple']): + doc_freq = {pickle.dumps(key): value for key,value in list(doc_key_freq_tuple[1].items())} + else: + raise Exception() + feature_frequency.append( + numpy.array( + list(doc_freq.items()), + dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] + )) + label_max_length = max([len(label) for label in label2id_dict.keys()]) + 10 + label2id = numpy.array(list(label2id_dict.items()), dtype=[('key', 'S{}'.format(label_max_length)), ('value', 'i8')]) + return SetDocumentInformation(feature_frequency, label2id, feature2id) \ No newline at end of file diff --git a/DocumentFeatureSelection/common/utils.py b/DocumentFeatureSelection/common/utils.py index 1a0ba52..02e3727 100644 --- a/DocumentFeatureSelection/common/utils.py +++ b/DocumentFeatureSelection/common/utils.py @@ -9,7 +9,10 @@ import logging import collections import joblib +import typing +import numpy import sys +import pickle python_version = sys.version_info logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) @@ -53,7 +56,7 @@ def __get_value_index(row_index, column_index, weight_csr_matrix, verbose=False) return value -def make_non_zero_information(weight_csr_matrix): +def make_non_zero_information(weight_csr_matrix:csr_matrix): """Construct Tuple of matrix value. Return value is array of ROW_COL_VAL namedtuple. 
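For reference, a toy sketch (independent of this module) of recovering the non-zero (row, col, value) triples that make_non_zero_information builds from a csr_matrix:

```python
from scipy.sparse import csr_matrix

m = csr_matrix([[0, 2, 0],
                [1, 0, 3]])
rows, cols = m.nonzero()
triples = [(int(r), int(c), int(m[r, c])) for r, c in zip(rows, cols)]
# triples == [(0, 1, 2), (1, 0, 1), (1, 2, 3)]
```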
:param weight_csr_matrix: @@ -80,19 +83,31 @@ def make_non_zero_information(weight_csr_matrix): def get_label(row_col_val_tuple, label_id): assert isinstance(row_col_val_tuple, ROW_COL_VAL) - assert isinstance(label_id, dict) + assert isinstance(label_id, numpy.ndarray) + + label = label_id[numpy.where(label_id['key'] == row_col_val_tuple.row)][0]['value'] + try: + original_label = pickle.loads(label) + except (pickle.UnpicklingError, KeyError): + original_label = label.decode('utf-8') - return label_id[row_col_val_tuple.row] + return original_label def get_word(row_col_val_tuple, vocabulary): assert isinstance(row_col_val_tuple, ROW_COL_VAL) - assert isinstance(vocabulary, dict) + assert isinstance(vocabulary, numpy.ndarray) - return vocabulary[row_col_val_tuple.col] + vocab = vocabulary[numpy.where(vocabulary['key'] == row_col_val_tuple.col)][0]['value'] + try: + original_vocab = pickle.loads(vocab) + except (pickle.UnpicklingError, KeyError): + original_vocab = vocab.decode('utf-8') + return original_vocab -def SUB_FUNC_feature_extraction(row_col_val_tuple, id2label, id2vocab): + +def SUB_FUNC_feature_extraction(row_col_val_tuple:typing.Tuple[int,int,int], id2label:numpy.ndarray, id2vocab:numpy.ndarray): """This function returns PMI score between label and words. Input csr matrix must be 'document-frequency' matrix, where records #document that word appears in document set. @@ -139,19 +154,16 @@ def get_feature_dictionary(weighted_matrix, vocabulary, label_group_dict, n_jobs :param bool cut_zero: return all result or not. If cut_zero = True, the method cuts zero features. """ assert isinstance(weighted_matrix, csr_matrix) - assert isinstance(vocabulary, dict) - assert isinstance(label_group_dict, dict) + assert isinstance(vocabulary, numpy.ndarray) + assert isinstance(label_group_dict, numpy.ndarray) assert isinstance(n_jobs, int) logger.debug(msg='Start making scored dictionary object from scored matrix') logger.debug(msg='Input matrix size= {} * {}'.format(weighted_matrix.shape[0], weighted_matrix.shape[1])) value_index_items = make_non_zero_information(weighted_matrix) - id2label = {id:label for label, id in label_group_dict.items()} - if python_version > (3, 0, 0): - id2vocab = {id:voc for voc, id in vocabulary.items()} - else: - id2vocab = {id:voc for voc, id in vocabulary.viewitems()} + id2label = numpy.array([(element['value'], element['key']) for element in label_group_dict], dtype=[('key', 'CB&`T+UCC5a`)QzlQ*Y@AX%MWctkBr`WN zFTEJd5lPNSP0o%lNG(dsFUn2KOHS1*sGQ<0-Xm0yn;Bo8S&|W7l2MeJm;%%>rF}~5 z6b)}lZ$|IK?Nhv^fHVt`mIl&{Kw1V$%R*^6Z$>Z2n?FU{r=(`Ed-L=N!c8nlEK1DD zNzH*8$OqJL4QK#ARHFcpW&rX9p|lWGTo`EJ#pGBCpn;`5ETwswB~#q(HXQrDVC&o7 H(j+|qmi}M1 literal 0 HcmV?d00001 From fdb3c488b0c1741f0058f82e0c53be272352a670 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 21 Sep 2016 23:38:03 +0900 Subject: [PATCH 02/12] refactored tf-idf --- .../common/crs_matrix_constructor.py | 4 +- .../common/data_converter.py | 8 ++-- .../common/labeledMultiDocs2labeledDocsSet.py | 39 +++++++++++++++---- DocumentFeatureSelection/soa/soa_python3.py | 19 ++++++--- examples/check_performance.py | 4 +- 5 files changed, 53 insertions(+), 21 deletions(-) diff --git a/DocumentFeatureSelection/common/crs_matrix_constructor.py b/DocumentFeatureSelection/common/crs_matrix_constructor.py index 9d25eb0..f599c7f 100644 --- a/DocumentFeatureSelection/common/crs_matrix_constructor.py +++ b/DocumentFeatureSelection/common/crs_matrix_constructor.py @@ -44,7 +44,7 @@ def get_data_col_row_values(doc_id:int, word:int, 
doc_freq:int, vocaburary:numpy def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary:numpy.ndarray)->numpy.ndarray: value_pairs = numpy.array([ - get_data_col_row_values(doc_id=doc_id, word=key_value_tuple[0], doc_freq=key_value_tuple[1], vocaburary=vocabulary) + get_data_col_row_values(doc_id=doc_id, word=key_value_tuple['key'], doc_freq=key_value_tuple['value'], vocaburary=vocabulary) for key_value_tuple in doc_freq_obj]) @@ -87,7 +87,7 @@ def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_back assert Exception('joblib_backend parameter must be either of {}. However your input is {}.'.format(PARAM_JOBLIB_BACKEND, joblib_backend)) assert isinstance(feature_frequency, list) - assert isinstance(vocabulary, (numpy.ndarray, numpy.array)) + assert isinstance(vocabulary, numpy.ndarray) assert isinstance(n_jobs, int) logger.debug(msg='making tuple pairs for csr matrix with n(process)={}'.format(n_jobs)) diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index 6b05cdb..986711a 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -149,7 +149,7 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=max(set_document_information.feature2id.values())+1, + n_feature=len(set_document_information.feature2id)+1, n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label @@ -164,9 +164,9 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, ) assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id, dict) - assert isinstance(set_document_information.label2id, dict) - assert isinstance(n_docs_distribution, list) + assert isinstance(set_document_information.label2id, numpy.ndarray) + assert isinstance(set_document_information.label2id, numpy.ndarray) + assert isinstance(n_docs_distribution, numpy.ndarray) return DataCsrMatrix( csr_matrix_, set_document_information.label2id, diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py index c7c6b8c..267067c 100644 --- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py +++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py @@ -35,8 +35,20 @@ def multiDocs2TermFreqInfo(labeled_documents): vocabulary_list = list(set(utils.flatten(labeled_documents.values()))) vocabulary_list = sorted(vocabulary_list) + type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()]) + if type_flag == set(['str']): + feature_list = list(set(utils.flatten(labeled_documents.values()))) + feature_list = sorted(feature_list) + max_lenght = max([len(s) for s in feature_list]) + elif type_flag == set(['tuple']): + # make tuple into string + feature_list = list(set(utils.flatten(labeled_documents.values()))) + feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] + max_lenght = max([len(s) for s in feature_list]) + 10 + else: + raise Exception('Your input data has various type of data. 
Detected types: {}'.format(type_flag)) - vocaburary2id_dict = {t: index for index, t in enumerate(vocabulary_list)} + feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: ndarray # make label: id dictionary structure label2id_dict = {} @@ -45,15 +57,28 @@ def multiDocs2TermFreqInfo(labeled_documents): document_index = 0 for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0]): - words_in_docs = utils.flatten(docs) - feature_frequency.append(dict(Counter(words_in_docs))) label2id_dict.update({key: document_index}) document_index += 1 + words_in_docs = utils.flatten(docs) + if type_flag == set(['str']): + term_freq = Counter(words_in_docs) + elif type_flag == set(['tuple']): + term_freq = {pickle.dumps(key): value for key,value in dict(Counter(words_in_docs)).items()} + else: + raise Exception() + + feature_frequency.append( + numpy.array( + [(index_tuple[1], index_tuple[0]) for index_tuple in enumerate(term_freq)], + dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] + )) - assert isinstance(vocaburary2id_dict, dict) + label_max_length = max([len(label) for label in label2id_dict.keys()]) + 10 + label2id = numpy.array(list(label2id_dict.items()), dtype=[('key', 'S{}'.format(label_max_length)), ('value', 'i8')]) + assert isinstance(feature2id, numpy.ndarray) assert isinstance(feature_frequency, list) - assert isinstance(label2id_dict, dict) - return SetDocumentInformation(feature_frequency, label2id_dict, vocaburary2id_dict) + assert isinstance(label2id, numpy.ndarray) + return SetDocumentInformation(feature_frequency, label2id, feature2id) def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str: @@ -91,7 +116,7 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple else: raise Exception('Your input data has various type of data. 
Detected types: {}'.format(type_flag)) - feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: array + feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: ndarray # make label: id dictionary structure label2id_dict = {} diff --git a/DocumentFeatureSelection/soa/soa_python3.py b/DocumentFeatureSelection/soa/soa_python3.py index d4066bd..82776b0 100644 --- a/DocumentFeatureSelection/soa/soa_python3.py +++ b/DocumentFeatureSelection/soa/soa_python3.py @@ -3,6 +3,7 @@ import logging import joblib import math +import numpy logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', @@ -18,9 +19,9 @@ class SOA(object): def __init__(self): pass - def fit_transform(self, X, unit_distribution, n_jobs=1, verbose=False, joblib_backend='multiprocessing'): + def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False, joblib_backend='multiprocessing'): assert isinstance(X, csr_matrix) - assert isinstance(unit_distribution, list) + assert isinstance(unit_distribution, numpy.ndarray) matrix_size = X.shape sample_range = list(range(0, matrix_size[0])) @@ -56,11 +57,14 @@ def fit_transform(self, X, unit_distribution, n_jobs=1, verbose=False, joblib_ba return soa_featured_csr_matrix - def docId_word_soa(self, X, unit_distribution, n_total_doc, feature_index, sample_index, verbose=False): + def docId_word_soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, + n_total_doc:int, + feature_index:int, + sample_index:int, verbose=False): """ """ assert isinstance(X, csr_matrix) - assert isinstance(unit_distribution, list) + assert isinstance(unit_distribution, numpy.ndarray) assert isinstance(feature_index, int) assert isinstance(sample_index, int) @@ -74,10 +78,13 @@ def docId_word_soa(self, X, unit_distribution, n_total_doc, feature_index, sampl ) return sample_index, feature_index, soa_score - def soa(self, X, unit_distribution, n_total_docs, feature_index, sample_index, verbose=False): + def soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, + n_total_docs:int, + feature_index:int, + sample_index:int, verbose=False): # X is either of term-frequency matrix per label or document-frequency per label assert isinstance(X, csr_matrix) - assert isinstance(unit_distribution, list) + assert isinstance(unit_distribution, numpy.ndarray) assert isinstance(feature_index, int) assert isinstance(sample_index, int) diff --git a/examples/check_performance.py b/examples/check_performance.py index 7286412..c45cd00 100644 --- a/examples/check_performance.py +++ b/examples/check_performance.py @@ -9,7 +9,7 @@ logger.level = logging.DEBUG -@profile +#@profile def pmi_with_parallel(input_corpus): logging.debug(msg='With multiprocessing backend') scored_matrix_obj = interface.run_feature_selection( @@ -20,7 +20,7 @@ def pmi_with_parallel(input_corpus): ) -@profile +#@profile def pmi_with_threading(input_corpus): logging.debug(msg='With threading backend') scored_matrix_obj = interface.run_feature_selection( From 80549b076783cf3913c37c1ad641e2d334fd74e8 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Thu, 22 Sep 2016 14:24:09 +0900 Subject: [PATCH 03/12] resoved bug during making term frequency matrix --- DocumentFeatureSelection/bns/bns_python3.py | 16 +++++-- .../common/data_converter.py | 4 +- .../common/labeledMultiDocs2labeledDocsSet.py | 23 ++++++---- DocumentFeatureSelection/interface.py | 4 +- README.md | 16 +++---- 
examples/example_python3.py | 42 +++++++++++++++++++ setup.py | 2 +- 7 files changed, 84 insertions(+), 23 deletions(-) diff --git a/DocumentFeatureSelection/bns/bns_python3.py b/DocumentFeatureSelection/bns/bns_python3.py index 07009d7..2b96235 100644 --- a/DocumentFeatureSelection/bns/bns_python3.py +++ b/DocumentFeatureSelection/bns/bns_python3.py @@ -26,7 +26,7 @@ def __check_matrix_form(self, X): if n_categories != 2: raise Exception('BNS input must be of 2 categories') - def fit_transform(self, X, y=None, **fit_params): + def fit_transform(self, X:csr_matrix, y=None, **fit_params): assert isinstance(X, csr_matrix) if not 'unit_distribution' in fit_params: @@ -88,7 +88,12 @@ def fit_transform(self, X, y=None, **fit_params): return bns_featured_csr_matrix - def docId_word_BNS(self, X, feature_index, sample_index, unit_distribution, true_index, verbose=False): + def docId_word_BNS(self, X:csr_matrix, + feature_index:int, + sample_index:int, + unit_distribution:np.ndarray, + true_index:int, + verbose=False): assert isinstance(X, csr_matrix) assert isinstance(feature_index, int) @@ -104,7 +109,12 @@ def docId_word_BNS(self, X, feature_index, sample_index, unit_distribution, true ) return sample_index, feature_index, bns_score - def bns(self, X, feature_index, sample_index, unit_distribution, true_index=0, verbose=False): + def bns(self, X:csr_matrix, + feature_index:int, + sample_index:int, + unit_distribution:np.ndarray, + true_index:int=0, + verbose:bool=False): if true_index==0: false_index = 1 elif true_index==1: diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index 986711a..dc77b66 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -149,7 +149,7 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=len(set_document_information.feature2id)+1, + n_feature=len(set_document_information.feature2id), n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label @@ -250,7 +250,7 @@ def labeledMultiDocs2DocFreqMatrix(self, logger.debug(msg='Finished pre-processing before CSR matrix') csr_matrix_ = crs_matrix_constructor.make_csr_objects( row=row, col=col, data=data, - n_feature=len(set_document_information.feature2id)+1, + n_feature=len(set_document_information.feature2id), n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py index 267067c..a178210 100644 --- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py +++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py @@ -11,18 +11,22 @@ logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) N_FEATURE_SWITCH_STRATEGY = 1000000 +def decode_into_utf8(string:str)->bytes: + """* what you can do + - convert string into etf-8 + """ + return string.encode('utf-8') + def generate_document_dict(document_key:str, - documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Dict[Union[str,bytes], int]]: + documents:List[Union[List[str], Tuple[Any]]])->Tuple[str,Counter]: """This function gets Document-frequency count in given list of documents """ assert isinstance(documents, list) 
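The loop that follows counts document frequency (how many documents contain a term), not term frequency; the same Counter idiom in isolation, with made-up documents:

```python
from collections import Counter

documents = [["aa", "aa", "bb"], ["aa", "cc"]]
document_frequencies = Counter()
for word_frequency in (Counter(document) for document in documents):
    # updating with keys() adds 1 per document, ignoring within-document repeats
    document_frequencies.update(word_frequency.keys())
# document_frequencies == Counter({'aa': 2, 'bb': 1, 'cc': 1})
```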
word_frequencies = [Counter(document) for document in documents] document_frequencies = Counter() for word_frequency in word_frequencies: document_frequencies.update(word_frequency.keys()) - document_frequency_dict = dict(document_frequencies) - assert isinstance(document_frequency_dict, dict) - return (document_key, document_frequency_dict) + return (document_key, document_frequencies) def multiDocs2TermFreqInfo(labeled_documents): @@ -63,13 +67,13 @@ def multiDocs2TermFreqInfo(labeled_documents): if type_flag == set(['str']): term_freq = Counter(words_in_docs) elif type_flag == set(['tuple']): - term_freq = {pickle.dumps(key): value for key,value in dict(Counter(words_in_docs)).items()} + term_freq = {pickle.dumps(key): value for key,value in Counter(words_in_docs).items()} else: raise Exception() feature_frequency.append( numpy.array( - [(index_tuple[1], index_tuple[0]) for index_tuple in enumerate(term_freq)], + [(index_tuple[0], index_tuple[1]) for index_tuple in term_freq.items()], dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] )) @@ -105,11 +109,12 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple assert len(type_flag)==1 if type_flag == set(['str']): - feature_list = list(set(utils.flatten(labeled_documents.values()))) + # all features are encoded into utf-8 + feature_list = [decode_into_utf8(str) for str in list(set(utils.flatten(labeled_documents.values())))] feature_list = sorted(feature_list) max_lenght = max([len(s) for s in feature_list]) elif type_flag == set(['tuple']): - # make tuple into string + # feature tuples are serialized by pickle feature_list = list(set(utils.flatten(labeled_documents.values()))) feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] max_lenght = max([len(s) for s in feature_list]) + 10 @@ -137,7 +142,7 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple label2id_dict.update({doc_key_freq_tuple[0]: document_index}) document_index += 1 if type_flag == set(['str']): - doc_freq = doc_key_freq_tuple[1] + doc_freq = {decode_into_utf8(key):value for key, value in doc_key_freq_tuple[1].items()} elif type_flag == set(['tuple']): doc_freq = {pickle.dumps(key): value for key,value in list(doc_key_freq_tuple[1].items())} else: diff --git a/DocumentFeatureSelection/interface.py b/DocumentFeatureSelection/interface.py index a721d26..8e50448 100644 --- a/DocumentFeatureSelection/interface.py +++ b/DocumentFeatureSelection/interface.py @@ -8,6 +8,7 @@ from typing import List, Dict, Any, Union, Tuple from scipy.sparse.csr import csr_matrix import logging +import numpy logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) METHOD_NAMES = ['soa', 'pmi', 'tf_idf', 'bns'] N_FEATURE_SWITCH_STRATEGY = 1000000 @@ -97,7 +98,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] joblib_backend=joblib_backend) assert isinstance(matrix_data_object, DataCsrMatrix) - true_class_index = matrix_data_object.label2id_dict['positive'] + true_class_index = matrix_data_object.label2id_dict[ + numpy.where(matrix_data_object.label2id_dict['key'] == b'positive')]['value'][0] backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) scored_sparse_matrix = BNS().fit_transform( X=matrix_data_object.csr_matrix_, diff --git a/README.md b/README.md index 0100879..fc1a476 100644 --- a/README.md +++ b/README.md @@ -112,17 +112,15 @@ or See scripts in `examples/` -# Performance -With my MacBookPro (late 2015) and 
version 1.1. - -And input data has 98,600 feature dimensions. +# Change log -- PMI takes around 6 minutes (with both of multiprocessing and multithreading) +For your reference I checked performance under following environment, + +- MacBookPro (late 2015) 3.1 GHz Intel Core i7, 16 GB 1867 MHz DDR3 +- input data has 98,600 feature dimensions. -# Change log - ## 0.6 2016/04/02 supports PMI and TF-IDF under Python3.x @@ -148,7 +146,11 @@ Removed a bug when calling n_gram method of DataConverter * Resolved bottleneck point in pre-processing * Fixed a bug which n_jobs parameter does not work in interface +* PMI takes around 6 minutes (with both of multiprocessing and multithreading) ## 1.2 2016/9/16 * A bug in calculating TF-IDF score, this bug was resolved. + +## 1.3 2016/9/ + diff --git a/examples/example_python3.py b/examples/example_python3.py index e6f279e..8837770 100644 --- a/examples/example_python3.py +++ b/examples/example_python3.py @@ -58,6 +58,26 @@ pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) +input_dict = { + "positive": [ + ["I", "aa", "aa", "aa", "aa", "aa"], + ["bb", "aa", "aa", "aa", "aa", "aa"], + ["I", "aa", "hero", "some", "ok", "aa"] + ], + "negative": [ + ["bb", "bb", "bb"], + ["bb", "bb", "bb"], + ["hero", "ok", "bb"], + ["hero", "cc", "bb"], + ] +} +tf_idf_scored_object = interface.run_feature_selection( + input_dict=input_dict, + method='bns', + n_jobs=1 +) +pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) + # ====================================================================================================== # expert usage @@ -99,4 +119,26 @@ method='tf_idf', n_jobs=5 ) +pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) + + +input_dict_tuple_feature = { + "positive": [ + [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], + [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("guy", "N"),) ], + [ (("i", "N"), ("am", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] + ], + "negative": [ + [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("girl", "N"),) ], + [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("girl", "N"),) ], + [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] + ] +} + + +tf_idf_scored_object = interface.run_feature_selection( + input_dict=input_dict_tuple_feature, + method='bns', + n_jobs=5 +) pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) \ No newline at end of file diff --git a/setup.py b/setup.py index 442317b..6d56571 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ """ __author__ = 'kensuke-mi' -__version__ = '1.2' +__version__ = '1.3' import sys from setuptools import setup, find_packages From 544d94312eac725cccfa12e9131d788ef3aebb17 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Thu, 22 Sep 2016 14:54:36 +0900 Subject: [PATCH 04/12] failed to run cython code with joblib multiprocessing --- .gitignore | 1 + DocumentFeatureSelection/interface.py | 6 ++- DocumentFeatureSelection/pmi/PMI_python3.py | 23 +++++++--- DocumentFeatureSelection/pmi/pmi.pyx | 49 +++++++++++++++++++++ examples/example_python3.py | 3 +- setup.py | 7 ++- 6 files changed, 78 insertions(+), 11 deletions(-) create mode 100644 DocumentFeatureSelection/pmi/pmi.pyx diff --git a/.gitignore b/.gitignore index ed673fe..24909e7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ __pycache__/ # C extensions *.so +*.c # Distribution / packaging .Python diff 
--git a/DocumentFeatureSelection/interface.py b/DocumentFeatureSelection/interface.py index 8e50448..be84b64 100644 --- a/DocumentFeatureSelection/interface.py +++ b/DocumentFeatureSelection/interface.py @@ -26,7 +26,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] ngram:int=1, n_jobs:int=1, joblib_backend='auto', - matrix_form=None)->ScoredResultObject: + matrix_form=None, + use_cython:bool=False)->ScoredResultObject: if not method in METHOD_NAMES: raise Exception('method name must be either of {}. Yours: {}'.format(METHOD_NAMES, method)) @@ -57,7 +58,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] scored_sparse_matrix = PMI().fit_transform(X=matrix_data_object.csr_matrix_, n_docs_distribution=matrix_data_object.n_docs_distribution, n_jobs=n_jobs, - joblib_backend=backend_strategy) + joblib_backend=backend_strategy, + use_cython=use_cython) assert isinstance(scored_sparse_matrix, csr_matrix) elif method == 'soa': backend_strategy = decide_joblib_strategy(matrix_data_object.vocabulary) diff --git a/DocumentFeatureSelection/pmi/PMI_python3.py b/DocumentFeatureSelection/pmi/PMI_python3.py index c5760ee..96f0472 100644 --- a/DocumentFeatureSelection/pmi/PMI_python3.py +++ b/DocumentFeatureSelection/pmi/PMI_python3.py @@ -5,6 +5,7 @@ from __future__ import division from scipy.sparse import csr_matrix from logging import getLogger, StreamHandler + import logging import joblib import math @@ -27,7 +28,12 @@ class PMI(object): def __init__(self): pass - def fit_transform(self, X, n_docs_distribution, n_jobs=1, verbose=False, joblib_backend='multiprocessing'): + def fit_transform(self, X, + n_docs_distribution, + n_jobs=1, + verbose=False, + joblib_backend='multiprocessing', + use_cython:bool=False): """Main method of PMI class. """ assert isinstance(X, csr_matrix) @@ -41,6 +47,13 @@ def fit_transform(self, X, n_docs_distribution, n_jobs=1, verbose=False, joblib_ logger.debug(msg='Start calculating PMI with n(process)={}'.format(n_jobs)) logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) + if use_cython: + import pyximport; pyximport.install() + from DocumentFeatureSelection.pmi import pmi + self.pmi = pmi + else: + self.pmi = self.pmi + pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( joblib.delayed(self.docId_word_PMI)( X=X, @@ -71,7 +84,8 @@ def docId_word_PMI(self, X:csr_matrix, n_total_doc:int, feature_index:int, sample_index:int, - verbose=False): + verbose=False, + use_cython:bool=False): """Calculate PMI score for fit_format() :param X: @@ -81,11 +95,6 @@ def docId_word_PMI(self, X:csr_matrix, :param label: :return: """ - assert isinstance(X, csr_matrix) - assert isinstance(n_docs_distribution, numpy.ndarray) - assert isinstance(feature_index, int) - assert isinstance(sample_index, int) - pmi_score = self.pmi( X=X, n_docs_distribution=n_docs_distribution, diff --git a/DocumentFeatureSelection/pmi/pmi.pyx b/DocumentFeatureSelection/pmi/pmi.pyx new file mode 100644 index 0000000..5e72b6e --- /dev/null +++ b/DocumentFeatureSelection/pmi/pmi.pyx @@ -0,0 +1,49 @@ +import numpy +import math + +def pmi(X, + n_docs_distribution, + n_total_doc, + feature_index, + sample_index, verbose=False): + """get PMI score for given feature & sample index + + :param X: + :param feature_index: + :param sample_index: + :return: + """ + matrix_size = X.shape + sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] + + # n_11 is #docs having feature(i.e. 
word) in the specified index(label) + n_11 = X[sample_index, feature_index] + # n_01 is #docs NOT having feature in the specified index(label) + n_01 = n_docs_distribution[sample_index] - n_11 + # n_10 is #docs having feature in NOT specified index(indexes except specified index) + n_10 = X[sample_indexes, feature_index].sum() + # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) + n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) + + if verbose: + print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) + print('n_11:{} n_01:{} n_10:{} n_00:{}'.format( + n_11, + n_01, + n_10, + n_00 + )) + + if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: + return 0 + else: + temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) + temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) + temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) + temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) + score = temp1 + temp2 + temp3 + temp4 + + if score < 0: + raise Exception('score under 0 is detected. Something strange in Input matrix. Check your input matrix.') + + return score \ No newline at end of file diff --git a/examples/example_python3.py b/examples/example_python3.py index 8837770..edbe7c6 100644 --- a/examples/example_python3.py +++ b/examples/example_python3.py @@ -35,7 +35,8 @@ input_dict=input_dict, method='pmi', ngram=1, - n_jobs=5 + n_jobs=1, + use_cython=True ) pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) diff --git a/setup.py b/setup.py index 6d56571..d0ef760 100644 --- a/setup.py +++ b/setup.py @@ -7,12 +7,15 @@ import sys from setuptools import setup, find_packages +from Cython.Build import cythonize +from distutils.extension import Extension +from Cython.Distutils import build_ext python_version = sys.version_info if python_version >= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', - 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc'] + 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython'] try: @@ -49,4 +52,6 @@ install_requires=install_requires, setup_requires=['six', 'setuptools>=1.0'], classifiers=[], + cmdclass={'build_ext': build_ext}, + ext_modules=[Extension("pmi", ["DocumentFeatureSelection/pmi/pmi.pyx"])] ) From c7a5edfa48daf90f4fb2c6d67d8bffafb60806f8 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Tue, 27 Sep 2016 10:11:24 +0900 Subject: [PATCH 05/12] used skleran for feature vectorising --- .../common/crs_matrix_constructor.py | 3 +- .../common/data_converter.py | 69 ++--------- .../common/labeledMultiDocs2labeledDocsSet.py | 116 ++++-------------- DocumentFeatureSelection/common/utils.py | 35 +++--- DocumentFeatureSelection/interface.py | 5 +- DocumentFeatureSelection/models.py | 18 +-- examples/example_python3.py | 28 ++--- setup.py | 2 +- 8 files changed, 81 insertions(+), 195 deletions(-) diff --git a/DocumentFeatureSelection/common/crs_matrix_constructor.py b/DocumentFeatureSelection/common/crs_matrix_constructor.py index f599c7f..457112d 100644 --- a/DocumentFeatureSelection/common/crs_matrix_constructor.py +++ b/DocumentFeatureSelection/common/crs_matrix_constructor.py @@ -4,8 +4,9 @@ import sys import logging import numpy -from typing import List, Tuple +from typing import List, Tuple, Dict from scipy.sparse import csr_matrix +from sklearn.feature_extraction import DictVectorizer 
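This commit moves vectorisation onto scikit-learn's DictVectorizer; a minimal sketch of the flow the later hunks adopt (toy counts, variable names only for illustration):

```python
from sklearn.feature_extraction import DictVectorizer

feature_documents = [{"aa": 5, "bb": 1}, {"bb": 3, "cc": 2}]  # one frequency dict per label
vec = DictVectorizer()
matrix_object = vec.fit_transform(feature_documents).tocsr()  # labels x features, sparse
# get_feature_names() is the call this patch uses; newer scikit-learn renames it
# to get_feature_names_out()
feature2id = {feat: feat_id for feat_id, feat in enumerate(vec.get_feature_names())}
# feature2id == {'aa': 0, 'bb': 1, 'cc': 2}
```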
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py index dc77b66..15c035a 100644 --- a/DocumentFeatureSelection/common/data_converter.py +++ b/DocumentFeatureSelection/common/data_converter.py @@ -68,11 +68,12 @@ def __check_data_structure(self, labeled_documents): return True - def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:numpy.core.multiarray.array): + + def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int]): """Count term-distribution per label. """ assert isinstance(labeled_documents, dict) - assert isinstance(label2id, numpy.ndarray) + assert isinstance(label2id, dict) # count total term-frequency per label term_frequency_distribution = { @@ -85,12 +86,14 @@ def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any] term_frequency_distribution_list = [0] * len(labeled_documents.keys()) for label_string, n_doc in term_frequency_distribution.items(): - term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + #term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + term_index = label2id[label_string] term_frequency_distribution_list[term_index] = n_doc return numpy.array(term_frequency_distribution_list, dtype='i8') - def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:numpy.core.multiarray.array)->numpy.ndarray: + + def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int])->numpy.ndarray: """This method count n(docs) per label. 
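In outline, the method packs per-label document counts into an id-indexed vector; a compact sketch with invented labels:

```python
import numpy

labeled_documents = {"positive": [["a"], ["b"], ["c"]], "negative": [["d"], ["e"]]}
label2id = {"negative": 0, "positive": 1}

n_doc_distribution_list = [0] * len(labeled_documents)
for label_string, docs in labeled_documents.items():
    n_doc_distribution_list[label2id[label_string]] = len(docs)
n_docs_distribution = numpy.array(n_doc_distribution_list, dtype='i8')
# n_docs_distribution == array([2, 3])
```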
:param labeled_documents: @@ -98,7 +101,7 @@ def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], lab :return: """ assert isinstance(labeled_documents, dict) - assert isinstance(label2id, numpy.ndarray) + assert isinstance(label2id, dict) # count n(docs) per label n_doc_distribution = { @@ -111,7 +114,8 @@ def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], lab n_doc_distribution_list = [0] * len(labeled_documents.keys()) for label_string, n_doc in n_doc_distribution.items(): - docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + #docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value'] + docs_index = label2id[label_string] n_doc_distribution_list[docs_index] = n_doc return numpy.array(n_doc_distribution_list, dtype='i8') @@ -132,25 +136,6 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, logger.debug(msg='Now pre-processing before CSR matrix') # convert data structure set_document_information = labeledMultiDocs2labeledDocsSet.multiDocs2TermFreqInfo(labeled_documents) - assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation) - logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature_frequency))) - if joblib_backend == 'auto' and len(set_document_information.feature_frequency) >= 100000: - joblib_backend = 'threading' - if joblib_backend == 'auto' and len(set_document_information.feature_frequency) < 100000: - joblib_backend = 'multiprocessing' - - # make set of tuples to construct csr_matrix - row, col, data = crs_matrix_constructor.preprocess_csr_matrix( - feature_frequency=set_document_information.feature_frequency, - vocabulary=set_document_information.feature2id, - n_jobs=n_jobs, - joblib_backend=joblib_backend - ) - logger.debug(msg='Finished pre-processing before CSR matrix') - csr_matrix_ = crs_matrix_constructor.make_csr_objects( - row=row, col=col, data=data, - n_feature=len(set_document_information.feature2id), - n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label n_docs_distribution = self.count_document_distribution( @@ -163,12 +148,8 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, label2id=set_document_information.label2id ) - assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id, numpy.ndarray) - assert isinstance(set_document_information.label2id, numpy.ndarray) - assert isinstance(n_docs_distribution, numpy.ndarray) return DataCsrMatrix( - csr_matrix_, + set_document_information.matrix_object, set_document_information.label2id, set_document_information.feature2id, n_docs_distribution, term_frequency_distribution) @@ -231,27 +212,8 @@ def labeledMultiDocs2DocFreqMatrix(self, logger.debug(msg='Now pre-processing before CSR matrix') # convert data structure set_document_information = labeledMultiDocs2labeledDocsSet.multiDocs2DocFreqInfo(labeled_documents, - n_jobs=n_jobs, - joblib_backend=joblib_backend) + n_jobs=n_jobs) assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation) - logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature2id))) - if joblib_backend == 'auto' and len(set_document_information.feature_frequency) >= 100000: - joblib_backend = 'threading' - if joblib_backend == 'auto' and 
len(set_document_information.feature_frequency) < 100000: - joblib_backend = 'multiprocessing' - - # make set of tuples to construct csr_matrix - row, col, data = crs_matrix_constructor.preprocess_csr_matrix( - feature_frequency=set_document_information.feature_frequency, - vocabulary=set_document_information.feature2id, - n_jobs=n_jobs, - joblib_backend=joblib_backend - ) - logger.debug(msg='Finished pre-processing before CSR matrix') - csr_matrix_ = crs_matrix_constructor.make_csr_objects( - row=row, col=col, data=data, - n_feature=len(set_document_information.feature2id), - n_docs=len(set_document_information.feature_frequency)) # count n(docs) per label n_docs_distribution = self.count_document_distribution( @@ -263,13 +225,8 @@ def labeledMultiDocs2DocFreqMatrix(self, labeled_documents=labeled_documents, label2id=set_document_information.label2id ) - - assert isinstance(csr_matrix_, csr_matrix) - assert isinstance(set_document_information.label2id, numpy.ndarray) - assert isinstance(set_document_information.feature2id, numpy.ndarray) - assert isinstance(n_docs_distribution, numpy.ndarray) return DataCsrMatrix( - csr_matrix_, + set_document_information.matrix_object, set_document_information.label2id, set_document_information.feature2id, n_docs_distribution, term_frequency_distribution) diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py index a178210..f490eaf 100644 --- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py +++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py @@ -3,14 +3,15 @@ from DocumentFeatureSelection.common import utils from DocumentFeatureSelection.models import SetDocumentInformation from DocumentFeatureSelection import init_logger +from sklearn.feature_extraction import DictVectorizer from typing import Dict, List, Tuple, Any, Union import logging import joblib -import numpy -import pickle +import itertools logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) N_FEATURE_SWITCH_STRATEGY = 1000000 + def decode_into_utf8(string:str)->bytes: """* what you can do - convert string into etf-8 @@ -37,52 +38,17 @@ def multiDocs2TermFreqInfo(labeled_documents): """ assert isinstance(labeled_documents, dict) - vocabulary_list = list(set(utils.flatten(labeled_documents.values()))) - vocabulary_list = sorted(vocabulary_list) - type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()]) - if type_flag == set(['str']): - feature_list = list(set(utils.flatten(labeled_documents.values()))) - feature_list = sorted(feature_list) - max_lenght = max([len(s) for s in feature_list]) - elif type_flag == set(['tuple']): - # make tuple into string - feature_list = list(set(utils.flatten(labeled_documents.values()))) - feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] - max_lenght = max([len(s) for s in feature_list]) + 10 - else: - raise Exception('Your input data has various type of data. 
Detected types: {}'.format(type_flag)) - - feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: ndarray - - # make label: id dictionary structure - label2id_dict = {} - # make list of Term-Frequency - feature_frequency = [] - document_index = 0 - - for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0]): - label2id_dict.update({key: document_index}) - document_index += 1 - words_in_docs = utils.flatten(docs) - if type_flag == set(['str']): - term_freq = Counter(words_in_docs) - elif type_flag == set(['tuple']): - term_freq = {pickle.dumps(key): value for key,value in Counter(words_in_docs).items()} - else: - raise Exception() - - feature_frequency.append( - numpy.array( - [(index_tuple[0], index_tuple[1]) for index_tuple in term_freq.items()], - dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] - )) - - label_max_length = max([len(label) for label in label2id_dict.keys()]) + 10 - label2id = numpy.array(list(label2id_dict.items()), dtype=[('key', 'S{}'.format(label_max_length)), ('value', 'i8')]) - assert isinstance(feature2id, numpy.ndarray) - assert isinstance(feature_frequency, list) - assert isinstance(label2id, numpy.ndarray) - return SetDocumentInformation(feature_frequency, label2id, feature2id) + counted_frequency = [(label, Counter(list(itertools.chain.from_iterable(documents)))) + for label, documents in labeled_documents.items()] + feature_documents = [dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency] + + # use sklearn feature-extraction + vec = DictVectorizer() + matrix_object = vec.fit_transform(feature_documents).tocsr() + feature2id = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())} + label2id = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)} + + return SetDocumentInformation(matrix_object, label2id, feature2id) def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str: @@ -100,7 +66,6 @@ def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str: def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple[Any]]]]], - joblib_backend:str='auto', n_jobs:int=1)->SetDocumentInformation: """This function generates information for constructing document-frequency matrix. """ @@ -108,50 +73,15 @@ def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()]) assert len(type_flag)==1 - if type_flag == set(['str']): - # all features are encoded into utf-8 - feature_list = [decode_into_utf8(str) for str in list(set(utils.flatten(labeled_documents.values())))] - feature_list = sorted(feature_list) - max_lenght = max([len(s) for s in feature_list]) - elif type_flag == set(['tuple']): - # feature tuples are serialized by pickle - feature_list = list(set(utils.flatten(labeled_documents.values()))) - feature_list = [pickle.dumps(feature_tuple) for feature_tuple in sorted(feature_list)] - max_lenght = max([len(s) for s in feature_list]) + 10 - else: - raise Exception('Your input data has various type of data. 
Detected types: {}'.format(type_flag)) - - feature2id = numpy.array(list(enumerate(feature_list)), dtype=[('value', 'i8'), ('key','S{}'.format(max_lenght))]) # type: ndarray - - # make label: id dictionary structure - label2id_dict = {} - # list of document-frequency array - feature_frequency = [] - - if joblib_backend == 'auto' and len(feature2id) >= N_FEATURE_SWITCH_STRATEGY: - joblib_backend = 'threading' - if joblib_backend == 'auto' and len(feature2id) < N_FEATURE_SWITCH_STRATEGY: - joblib_backend = 'multiprocessing' - - counted_frequency = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( + counted_frequency = joblib.Parallel(n_jobs=n_jobs)( joblib.delayed(generate_document_dict)(key, docs) for key, docs in sorted(labeled_documents.items(), key=lambda key_value_tuple: key_value_tuple[0])) + feature_documents = [dict(label_freqCounter_tuple[1]) for label_freqCounter_tuple in counted_frequency] + + # use sklearn feature-extraction + vec = DictVectorizer() + matrix_object = vec.fit_transform(feature_documents).tocsr() + feature2id = {feat:feat_id for feat_id, feat in enumerate(vec.get_feature_names())} + label2id = {label_freqCounter_tuple[0]:label_id for label_id, label_freqCounter_tuple in enumerate(counted_frequency)} - document_index = 0 - for doc_key_freq_tuple in counted_frequency: - label2id_dict.update({doc_key_freq_tuple[0]: document_index}) - document_index += 1 - if type_flag == set(['str']): - doc_freq = {decode_into_utf8(key):value for key, value in doc_key_freq_tuple[1].items()} - elif type_flag == set(['tuple']): - doc_freq = {pickle.dumps(key): value for key,value in list(doc_key_freq_tuple[1].items())} - else: - raise Exception() - feature_frequency.append( - numpy.array( - list(doc_freq.items()), - dtype=[('key', 'S{}'.format(max_lenght)), ('value', 'i8')] - )) - label_max_length = max([len(label) for label in label2id_dict.keys()]) + 10 - label2id = numpy.array(list(label2id_dict.items()), dtype=[('key', 'S{}'.format(label_max_length)), ('value', 'i8')]) - return SetDocumentInformation(feature_frequency, label2id, feature2id) \ No newline at end of file + return SetDocumentInformation(matrix_object, label2id, feature2id) \ No newline at end of file diff --git a/DocumentFeatureSelection/common/utils.py b/DocumentFeatureSelection/common/utils.py index 02e3727..985eae3 100644 --- a/DocumentFeatureSelection/common/utils.py +++ b/DocumentFeatureSelection/common/utils.py @@ -83,28 +83,23 @@ def make_non_zero_information(weight_csr_matrix:csr_matrix): def get_label(row_col_val_tuple, label_id): assert isinstance(row_col_val_tuple, ROW_COL_VAL) - assert isinstance(label_id, numpy.ndarray) + #assert isinstance(label_id, numpy.ndarray) + assert isinstance(label_id, dict) - label = label_id[numpy.where(label_id['key'] == row_col_val_tuple.row)][0]['value'] - try: - original_label = pickle.loads(label) - except (pickle.UnpicklingError, KeyError): - original_label = label.decode('utf-8') + #label = label_id[numpy.where(label_id['key'] == row_col_val_tuple.row)][0]['value'] + label = label_id[row_col_val_tuple.row] - return original_label + return label def get_word(row_col_val_tuple, vocabulary): assert isinstance(row_col_val_tuple, ROW_COL_VAL) - assert isinstance(vocabulary, numpy.ndarray) - - vocab = vocabulary[numpy.where(vocabulary['key'] == row_col_val_tuple.col)][0]['value'] - try: - original_vocab = pickle.loads(vocab) - except (pickle.UnpicklingError, KeyError): - original_vocab = vocab.decode('utf-8') + #assert isinstance(vocabulary, numpy.ndarray) + assert 
isinstance(vocabulary, dict) + #vocab = vocabulary[numpy.where(vocabulary['key'] == row_col_val_tuple.col)][0]['value'] + vocab = vocabulary[row_col_val_tuple.col] - return original_vocab + return vocab def SUB_FUNC_feature_extraction(row_col_val_tuple:typing.Tuple[int,int,int], id2label:numpy.ndarray, id2vocab:numpy.ndarray): @@ -154,16 +149,18 @@ def get_feature_dictionary(weighted_matrix, vocabulary, label_group_dict, n_jobs :param bool cut_zero: return all result or not. If cut_zero = True, the method cuts zero features. """ assert isinstance(weighted_matrix, csr_matrix) - assert isinstance(vocabulary, numpy.ndarray) - assert isinstance(label_group_dict, numpy.ndarray) + assert isinstance(vocabulary, dict) + assert isinstance(label_group_dict, dict) assert isinstance(n_jobs, int) logger.debug(msg='Start making scored dictionary object from scored matrix') logger.debug(msg='Input matrix size= {} * {}'.format(weighted_matrix.shape[0], weighted_matrix.shape[1])) value_index_items = make_non_zero_information(weighted_matrix) - id2label = numpy.array([(element['value'], element['key']) for element in label_group_dict], dtype=[('key', '= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', - 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython'] + 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython', 'scikit-learn'] try: From 1c24bb3ec76e0618b8aaa87f756d75b0bd12386e Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Tue, 27 Sep 2016 14:30:27 +0900 Subject: [PATCH 06/12] added cython code, but having problem in cthonize --- DocumentFeatureSelection/pmi/PMI_python3.py | 140 +++++++++++--------- DocumentFeatureSelection/pmi/pmi.pyx | 49 ------- DocumentFeatureSelection/pmi/pmi_cython.pyx | 71 ++++++++++ examples/check_performance.py | 23 +++- examples/example_python3.py | 14 +- setup.py | 8 +- 6 files changed, 185 insertions(+), 120 deletions(-) delete mode 100644 DocumentFeatureSelection/pmi/pmi.pyx create mode 100644 DocumentFeatureSelection/pmi/pmi_cython.pyx diff --git a/DocumentFeatureSelection/pmi/PMI_python3.py b/DocumentFeatureSelection/pmi/PMI_python3.py index 96f0472..8932f34 100644 --- a/DocumentFeatureSelection/pmi/PMI_python3.py +++ b/DocumentFeatureSelection/pmi/PMI_python3.py @@ -24,6 +24,60 @@ # TODO normzalized pmiの導入 # http://sucrose.hatenablog.com/entry/2014/12/02/235959 + +def pmi(X:csr_matrix, + n_docs_distribution:numpy.ndarray, + n_total_doc:int, + feature_index:int, + sample_index:int, verbose=False): + """get PMI score for given feature & sample index + + :param X: + :param feature_index: + :param sample_index: + :return: + """ + assert isinstance(X, csr_matrix) + assert isinstance(n_docs_distribution, numpy.ndarray) + assert isinstance(feature_index, int) + assert isinstance(sample_index, int) + + matrix_size = X.shape + sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] + + # n_11 is #docs having feature(i.e. 
word) in the specified index(label) + n_11 = X[sample_index, feature_index] + # n_01 is #docs NOT having feature in the specified index(label) + n_01 = n_docs_distribution[sample_index] - n_11 + # n_10 is #docs having feature in NOT specified index(indexes except specified index) + n_10 = X[sample_indexes, feature_index].sum() + # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) + n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) + + if verbose: + logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) + logging.debug('n_11:{} n_01:{} n_10:{} n_00:{}'.format( + n_11, + n_01, + n_10, + n_00 + )) + + if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: + return 0 + else: + temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) + temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) + temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) + temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) + score = temp1 + temp2 + temp3 + temp4 + + if score < 0: + raise Exception('score under 0 is detected. Something strange in Input matrix. Check your input matrix.') + + return score + + class PMI(object): def __init__(self): pass @@ -49,23 +103,29 @@ def fit_transform(self, X, if use_cython: import pyximport; pyximport.install() - from DocumentFeatureSelection.pmi import pmi - self.pmi = pmi + from DocumentFeatureSelection.pmi.pmi_cython import main + logger.warning(msg='n_jobs parameter is invalid when use_cython=True') + print(X.toarray()) + pmi_score_csr_source = main(X=X, + n_docs_distribution=n_docs_distribution, + sample_range=sample_range, + feature_range=feature_range, + n_total_doc=n_total_document) + else: - self.pmi = self.pmi - - pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( - joblib.delayed(self.docId_word_PMI)( - X=X, - n_docs_distribution=n_docs_distribution, - feature_index=feature_index, - sample_index=sample_index, - n_total_doc=n_total_document, - verbose=verbose + self.pmi = pmi + pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( + joblib.delayed(self.docId_word_PMI)( + X=X, + n_docs_distribution=n_docs_distribution, + feature_index=feature_index, + sample_index=sample_index, + n_total_doc=n_total_document, + verbose=verbose + ) + for sample_index in sample_range + for feature_index in feature_range ) - for sample_index in sample_range - for feature_index in feature_range - ) row_list = [t[0] for t in pmi_score_csr_source] col_list = [t[1] for t in pmi_score_csr_source] @@ -105,54 +165,4 @@ def docId_word_PMI(self, X:csr_matrix, ) return sample_index, feature_index, pmi_score - def pmi(self, X:csr_matrix, - n_docs_distribution:numpy.ndarray, - n_total_doc:int, - feature_index:int, - sample_index:int, verbose=False): - """get PMI score for given feature & sample index - - :param X: - :param feature_index: - :param sample_index: - :return: - """ - assert isinstance(X, csr_matrix) - assert isinstance(n_docs_distribution, numpy.ndarray) - assert isinstance(feature_index, int) - assert isinstance(sample_index, int) - - matrix_size = X.shape - sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] - - # n_11 is #docs having feature(i.e. 
word) in the specified index(label) - n_11 = X[sample_index, feature_index] - # n_01 is #docs NOT having feature in the specified index(label) - n_01 = n_docs_distribution[sample_index] - n_11 - # n_10 is #docs having feature in NOT specified index(indexes except specified index) - n_10 = X[sample_indexes, feature_index].sum() - # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) - n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) - - if verbose: - logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) - logging.debug('n_11:{} n_01:{} n_10:{} n_00:{}'.format( - n_11, - n_01, - n_10, - n_00 - )) - - if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: - return 0 - else: - temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) - temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) - temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) - temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) - score = temp1 + temp2 + temp3 + temp4 - - if score < 0: - raise Exception('score under 0 is detected. Something strange in Input matrix. Check your input matrix.') - return score diff --git a/DocumentFeatureSelection/pmi/pmi.pyx b/DocumentFeatureSelection/pmi/pmi.pyx deleted file mode 100644 index 5e72b6e..0000000 --- a/DocumentFeatureSelection/pmi/pmi.pyx +++ /dev/null @@ -1,49 +0,0 @@ -import numpy -import math - -def pmi(X, - n_docs_distribution, - n_total_doc, - feature_index, - sample_index, verbose=False): - """get PMI score for given feature & sample index - - :param X: - :param feature_index: - :param sample_index: - :return: - """ - matrix_size = X.shape - sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] - - # n_11 is #docs having feature(i.e. word) in the specified index(label) - n_11 = X[sample_index, feature_index] - # n_01 is #docs NOT having feature in the specified index(label) - n_01 = n_docs_distribution[sample_index] - n_11 - # n_10 is #docs having feature in NOT specified index(indexes except specified index) - n_10 = X[sample_indexes, feature_index].sum() - # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) - n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) - - if verbose: - print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) - print('n_11:{} n_01:{} n_10:{} n_00:{}'.format( - n_11, - n_01, - n_10, - n_00 - )) - - if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: - return 0 - else: - temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) - temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) - temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) - temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) - score = temp1 + temp2 + temp3 + temp4 - - if score < 0: - raise Exception('score under 0 is detected. Something strange in Input matrix. 
Check your input matrix.') - - return score \ No newline at end of file diff --git a/DocumentFeatureSelection/pmi/pmi_cython.pyx b/DocumentFeatureSelection/pmi/pmi_cython.pyx new file mode 100644 index 0000000..5acbde5 --- /dev/null +++ b/DocumentFeatureSelection/pmi/pmi_cython.pyx @@ -0,0 +1,71 @@ +import math +import scipy +cimport numpy as np + +cdef float pmi(np.ndarray[np.float64_t, ndim=2] X, + int n_samples, + np.ndarray[np.int64_t, ndim=1] n_docs_distribution, + int n_total_doc, + int feature_index, + int sample_index): + """get PMI score for given feature & sample index + """ + cdef i + sample_indexes = [i for i in range(0, n_samples) if i != sample_index] + + # n_11 is #docs having feature(i.e. word) in the specified index(label) + cdef float n_11 = X[sample_index, feature_index] + # n_01 is #docs NOT having feature in the specified index(label) + cdef float n_01 = n_docs_distribution[sample_index] - n_11 + # n_10 is #docs having feature in NOT specified index(indexes except specified index) + cdef float n_10 = X[sample_indexes, feature_index].sum() + # n_00 is #docs NOT having feature in NOT specified index(indexes except specified index) + cdef float n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index]) + + cdef float temp1, temp2, temp3, temp4, score + + if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0: + return 0 + else: + temp1 = n_11/n_total_doc * math.log((n_total_doc*n_11)/((n_10+n_11)*(n_01+n_11)), 2) + temp2 = n_01/n_total_doc * math.log((n_total_doc*n_01)/((n_00+n_01)*(n_01+n_11)), 2) + temp3 = n_10/n_total_doc * math.log((n_total_doc*n_10)/((n_10+n_11)*(n_00+n_10)), 2) + temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) + score = temp1 + temp2 + temp3 + temp4 + + if score < 0: + print(score) + raise Exception('PMI score={}. Score under 0 is detected. Something strange in Input matrix. Check your input matrix.'.format(score)) + + return score + + +def main(X, + np.ndarray[np.int64_t, ndim=1] n_docs_distribution, + int n_total_doc, + sample_range, + feature_range): + """What you can do + - calculate PMI score based on given data. + - The function returns list of tuple, whose element is (sample_index, feature_index, score) + - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature. 
+ """ + + cdef int n_samples = X.shape[0] + + if isinstance(X, scipy.sparse.csr_matrix): + X = X.toarray() + + cdef int sample_index, feature_index + pmi_score_csr_source = [ + ( + sample_index, + feature_index, + pmi(X, n_samples, n_docs_distribution, n_total_doc, feature_index, sample_index) + ) + for sample_index in sample_range + for feature_index in feature_range + ] + non_zero_pmi_score_csr_source = [score_tuple for score_tuple in pmi_score_csr_source if not score_tuple[2]==0] + + return non_zero_pmi_score_csr_source \ No newline at end of file diff --git a/examples/check_performance.py b/examples/check_performance.py index c45cd00..b254aeb 100644 --- a/examples/check_performance.py +++ b/examples/check_performance.py @@ -1,6 +1,7 @@ from DocumentFeatureSelection import interface import nltk import logging +import time try: import line_profiler except: @@ -12,16 +13,20 @@ #@profile def pmi_with_parallel(input_corpus): logging.debug(msg='With multiprocessing backend') + start = time.time() scored_matrix_obj = interface.run_feature_selection( input_dict=input_corpus, method='pmi', n_jobs=-1, joblib_backend='multiprocessing' ) + elapsed_time = time.time() - start + print ("elapsed_time with multiprocess:{}".format(elapsed_time)) + "[sec]" #@profile def pmi_with_threading(input_corpus): + start = time.time() logging.debug(msg='With threading backend') scored_matrix_obj = interface.run_feature_selection( input_dict=input_corpus, @@ -29,6 +34,21 @@ def pmi_with_threading(input_corpus): n_jobs=-1, joblib_backend='threading' ) + elapsed_time = time.time() - start + print ("elapsed_time with multithreading:{}".format(elapsed_time)) + "[sec]" + + +def pmi_with_cython(input_corpus): + logging.debug(msg='With cython is True') + start = time.time() + scored_matrix_obj = interface.run_feature_selection( + input_dict=input_corpus, + method='pmi', + n_jobs=-1, + use_cython=True + ) + elapsed_time = time.time() - start + print ("elapsed_time with multithreading:{}".format(elapsed_time)) + "[sec]" from nltk.corpus import gutenberg from nltk.corpus import webtext @@ -47,5 +67,6 @@ def pmi_with_threading(input_corpus): 'gutenberg': list(gutenberg_corpus) } +pmi_with_cython(input_corpus) pmi_with_parallel(input_corpus) -pmi_with_threading(input_corpus) \ No newline at end of file +pmi_with_threading(input_corpus) diff --git a/examples/example_python3.py b/examples/example_python3.py index 800c90b..19241c7 100644 --- a/examples/example_python3.py +++ b/examples/example_python3.py @@ -36,8 +36,7 @@ input_dict=input_dict, method='tf_idf', ngram=1, - n_jobs=5, - use_cython=False + n_jobs=5 ) pmi_scored_object = interface.run_feature_selection( @@ -49,6 +48,17 @@ ) pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) +# you can use cython version pmi also +# !Warning! The output value with "use_cython=True" is veeeery little different such as the 10th decimal place. 
+pmi_scored_object_cython = interface.run_feature_selection( + input_dict=input_dict, + method='pmi', + ngram=1, + n_jobs=1, + use_cython=True +) +pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) + soa_scored_object = interface.run_feature_selection( input_dict=input_dict, diff --git a/setup.py b/setup.py index 5d03ace..4431201 100644 --- a/setup.py +++ b/setup.py @@ -7,17 +7,18 @@ import sys from setuptools import setup, find_packages +import numpy from Cython.Build import cythonize from distutils.extension import Extension from Cython.Distutils import build_ext + python_version = sys.version_info if python_version >= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython', 'scikit-learn'] - try: import pypandoc long_description = pypandoc.convert('README.md', 'rst') @@ -53,5 +54,6 @@ setup_requires=['six', 'setuptools>=1.0'], classifiers=[], cmdclass={'build_ext': build_ext}, - ext_modules=[Extension("pmi", ["DocumentFeatureSelection/pmi/pmi.pyx"])] -) + ext_modules=cythonize("DocumentFeatureSelection/pmi/pmi_cython.pyx"), + include_dirs = [numpy.get_include()] +) \ No newline at end of file From a122dc32d147f456c942e8c8af1aa4ef94de316c Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 28 Sep 2016 09:37:13 +0900 Subject: [PATCH 07/12] added relative compile system of cython codes --- setup.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 4431201..a023c26 100644 --- a/setup.py +++ b/setup.py @@ -7,10 +7,31 @@ import sys from setuptools import setup, find_packages -import numpy -from Cython.Build import cythonize -from distutils.extension import Extension -from Cython.Distutils import build_ext + +# Flags to compile Cython code or use already compiled code +# -------------------------------------------------------------------------------------------------------- +try: + from Cython.Build import cythonize + from distutils.extension import Extension + from Cython.Distutils import build_ext + import numpy +except ImportError: + use_cython = False +else: + use_cython = True + +cmdclass = { } +ext_modules = [ ] +if use_cython: + ext_modules += [ + Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.pyx" ]), + ] + cmdclass.update({ 'build_ext': build_ext }) +else: + ext_modules += [ + Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.c" ]), + ] +# -------------------------------------------------------------------------------------------------------- python_version = sys.version_info @@ -18,6 +39,8 @@ if python_version >= (3, 0, 0): install_requires = ['six', 'setuptools>=1.0', 'joblib', 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython', 'scikit-learn'] +else: + raise Exception('This package does NOT support Python2.x') try: import pypandoc @@ -53,7 +76,7 @@ install_requires=install_requires, setup_requires=['six', 'setuptools>=1.0'], classifiers=[], - cmdclass={'build_ext': build_ext}, - ext_modules=cythonize("DocumentFeatureSelection/pmi/pmi_cython.pyx"), + cmdclass=cmdclass, + ext_modules=ext_modules, include_dirs = [numpy.get_include()] ) \ No newline at end of file From 3d541d565ce848ea03c2dd366f4a5c31fd072e5d Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 28 Sep 2016 11:25:06 +0900 Subject: [PATCH 08/12] added cython on soa --- DocumentFeatureSelection/interface.py | 3 +- 
DocumentFeatureSelection/pmi/PMI_python3.py | 7 +- DocumentFeatureSelection/pmi/pmi_cython.pyx | 14 ++- DocumentFeatureSelection/soa/soa_cython.pyx | 76 ++++++++++++ DocumentFeatureSelection/soa/soa_python3.py | 126 +++++++++++--------- examples/check_performance.py | 10 +- examples/example_python3.py | 55 ++++++++- setup.py | 3 +- 8 files changed, 214 insertions(+), 80 deletions(-) create mode 100644 DocumentFeatureSelection/soa/soa_cython.pyx diff --git a/DocumentFeatureSelection/interface.py b/DocumentFeatureSelection/interface.py index 3889208..0673960 100644 --- a/DocumentFeatureSelection/interface.py +++ b/DocumentFeatureSelection/interface.py @@ -66,7 +66,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] scored_sparse_matrix = SOA().fit_transform(X=matrix_data_object.csr_matrix_, unit_distribution=matrix_data_object.n_docs_distribution, n_jobs=n_jobs, - joblib_backend=backend_strategy) + joblib_backend=backend_strategy, + use_cython=use_cython) assert isinstance(scored_sparse_matrix, csr_matrix) elif method == 'soa' and matrix_form == 'term_freq': diff --git a/DocumentFeatureSelection/pmi/PMI_python3.py b/DocumentFeatureSelection/pmi/PMI_python3.py index 8932f34..92d0006 100644 --- a/DocumentFeatureSelection/pmi/PMI_python3.py +++ b/DocumentFeatureSelection/pmi/PMI_python3.py @@ -72,9 +72,6 @@ def pmi(X:csr_matrix, temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) score = temp1 + temp2 + temp3 + temp4 - if score < 0: - raise Exception('score under 0 is detected. Something strange in Input matrix. Check your input matrix.') - return score @@ -105,12 +102,12 @@ def fit_transform(self, X, import pyximport; pyximport.install() from DocumentFeatureSelection.pmi.pmi_cython import main logger.warning(msg='n_jobs parameter is invalid when use_cython=True') - print(X.toarray()) pmi_score_csr_source = main(X=X, n_docs_distribution=n_docs_distribution, sample_range=sample_range, feature_range=feature_range, - n_total_doc=n_total_document) + n_total_doc=n_total_document, + verbose=False) else: self.pmi = pmi diff --git a/DocumentFeatureSelection/pmi/pmi_cython.pyx b/DocumentFeatureSelection/pmi/pmi_cython.pyx index 5acbde5..856ee6c 100644 --- a/DocumentFeatureSelection/pmi/pmi_cython.pyx +++ b/DocumentFeatureSelection/pmi/pmi_cython.pyx @@ -1,13 +1,15 @@ import math import scipy cimport numpy as np +from cpython cimport bool cdef float pmi(np.ndarray[np.float64_t, ndim=2] X, int n_samples, np.ndarray[np.int64_t, ndim=1] n_docs_distribution, int n_total_doc, int feature_index, - int sample_index): + int sample_index, + bool verbose): """get PMI score for given feature & sample index """ cdef i @@ -33,9 +35,8 @@ cdef float pmi(np.ndarray[np.float64_t, ndim=2] X, temp4 = n_00/n_total_doc * math.log((n_total_doc*n_00)/((n_00+n_01)*(n_00+n_10)), 2) score = temp1 + temp2 + temp3 + temp4 - if score < 0: - print(score) - raise Exception('PMI score={}. Score under 0 is detected. Something strange in Input matrix. Check your input matrix.'.format(score)) + if verbose: + print('score={}, temp1={}, temp2={}, temp3={}, temp4={}, n11={}, n10={}, n01={}, n00={}, n_total_docs={}'.format(score, temp1, temp2, temp3, temp4, n_11, n_10, n_01, n_00, n_total_doc)) return score @@ -44,7 +45,8 @@ def main(X, np.ndarray[np.int64_t, ndim=1] n_docs_distribution, int n_total_doc, sample_range, - feature_range): + feature_range, + bool verbose=False): """What you can do - calculate PMI score based on given data. 
- The function returns list of tuple, whose element is (sample_index, feature_index, score) @@ -61,7 +63,7 @@ def main(X, ( sample_index, feature_index, - pmi(X, n_samples, n_docs_distribution, n_total_doc, feature_index, sample_index) + pmi(X, n_samples, n_docs_distribution, n_total_doc, feature_index, sample_index, verbose) ) for sample_index in sample_range for feature_index in feature_range diff --git a/DocumentFeatureSelection/soa/soa_cython.pyx b/DocumentFeatureSelection/soa/soa_cython.pyx new file mode 100644 index 0000000..19f4635 --- /dev/null +++ b/DocumentFeatureSelection/soa/soa_cython.pyx @@ -0,0 +1,76 @@ +import math +import scipy +cimport numpy as np +from cpython cimport bool + +cdef float soa( + np.ndarray[np.float64_t, ndim=2] X, + np.ndarray[np.int64_t, ndim=1] unit_distribution, + int n_total_docs, + int feature_index, + int sample_index, + bool verbose): + # X is either of term-frequency matrix per label or document-frequency per label + + matrix_size = X.shape + NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] + + # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e + cdef float freq_w_e = X[sample_index, feature_index] + # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e + cdef float freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() + # freq_e is the number of the unit having specific label e + cdef float freq_e = unit_distribution[sample_index] + # freq_not_e is the number of the unit NOT having the specific label e + cdef float freq_not_e = n_total_docs - freq_e + cdef float nominator, denominator, ans, soa_val + + if verbose: + print('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) + print('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format( + freq_w_e, + freq_w_not_e, + freq_e, + freq_not_e + )) + + if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0: + return 0.0 + else: + nominator = (float(freq_w_e) * freq_not_e) + denominator = (float(freq_e) * freq_w_not_e) + ans = nominator / denominator + soa_val = math.log(ans, 2) + return soa_val + + +def main(X, + np.ndarray[np.int64_t, ndim=1] n_docs_distribution, + int n_total_doc, + sample_range, + feature_range, + bool verbose=False): + """What you can do + - calculate PMI score based on given data. + - The function returns list of tuple, whose element is (sample_index, feature_index, score) + - Your input matrix should be numpy.ndarray or scipy.sparse.csr_matrix. The matrix should represent document-frequency of each feature. 
+ """ + + cdef int n_samples = X.shape[0] + + if isinstance(X, scipy.sparse.csr_matrix): + X = X.toarray() + + cdef int sample_index, feature_index + soa_score_csr_source = [ + ( + sample_index, + feature_index, + soa(X, n_samples, n_docs_distribution, feature_index, sample_index, verbose) + ) + for sample_index in sample_range + for feature_index in feature_range + ] + non_zero_soa_score_csr_source = [score_tuple for score_tuple in soa_score_csr_source if not score_tuple[2]==0] + + return non_zero_soa_score_csr_source \ No newline at end of file diff --git a/DocumentFeatureSelection/soa/soa_python3.py b/DocumentFeatureSelection/soa/soa_python3.py index 82776b0..1dddc7a 100644 --- a/DocumentFeatureSelection/soa/soa_python3.py +++ b/DocumentFeatureSelection/soa/soa_python3.py @@ -15,11 +15,54 @@ __author__ = 'kensuke-mi' +def soa(X:csr_matrix, unit_distribution:numpy.ndarray, + n_total_docs:int, + feature_index:int, + sample_index:int, verbose=False): + # X is either of term-frequency matrix per label or document-frequency per label + assert isinstance(X, csr_matrix) + assert isinstance(unit_distribution, numpy.ndarray) + assert isinstance(feature_index, int) + assert isinstance(sample_index, int) + + matrix_size = X.shape + NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] + + # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e + freq_w_e = X[sample_index, feature_index] + # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e + freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() + # freq_e is the number of the unit having specific label e + freq_e = unit_distribution[sample_index] + # freq_not_e is the number of the unit NOT having the specific label e + freq_not_e = n_total_docs - freq_e + + if verbose: + logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) + logging.debug('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format( + freq_w_e, + freq_w_not_e, + freq_e, + freq_not_e + )) + + if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0: + return 0 + else: + nominator = (float(freq_w_e) * freq_not_e) + denominator = (float(freq_e) * freq_w_not_e) + ans = nominator / denominator + assert isinstance(ans, float) + soa_val = math.log(ans, 2) + return soa_val + + class SOA(object): def __init__(self): pass - def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False, joblib_backend='multiprocessing'): + def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=False, + joblib_backend='multiprocessing', use_cython:bool=False): assert isinstance(X, csr_matrix) assert isinstance(unit_distribution, numpy.ndarray) @@ -31,22 +74,34 @@ def fit_transform(self, X, unit_distribution:numpy.ndarray, n_jobs=1, verbose=Fa logger.debug(msg='Start calculating SOA with n(process)={}'.format(n_jobs)) logger.debug(msg='size(input_matrix)={} * {}'.format(X.shape[0], X.shape[1])) - pmi_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( - joblib.delayed(self.docId_word_soa)( - X=X, - unit_distribution=unit_distribution, - feature_index=feature_index, - sample_index=sample_index, - n_total_doc=n_total_document, - verbose=verbose + if use_cython: + import pyximport; pyximport.install() + from DocumentFeatureSelection.soa.soa_cython import main + logger.warning(msg='n_jobs parameter is invalid when use_cython=True') + soa_score_csr_source = main(X=X, + 
n_docs_distribution=unit_distribution, + n_total_doc=n_total_document, + sample_range=sample_range, + feature_range=feature_range, + verbose=False) + else: + self.soa = soa + soa_score_csr_source = joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)( + joblib.delayed(self.docId_word_soa)( + X=X, + unit_distribution=unit_distribution, + feature_index=feature_index, + sample_index=sample_index, + n_total_doc=n_total_document, + verbose=verbose + ) + for sample_index in sample_range + for feature_index in feature_range ) - for sample_index in sample_range - for feature_index in feature_range - ) - row_list = [t[0] for t in pmi_score_csr_source] - col_list = [t[1] for t in pmi_score_csr_source] - data_list = [t[2] for t in pmi_score_csr_source] + row_list = [t[0] for t in soa_score_csr_source] + col_list = [t[1] for t in soa_score_csr_source] + data_list = [t[2] for t in soa_score_csr_source] soa_featured_csr_matrix = csr_matrix((data_list, (row_list, col_list)), shape=(X.shape[0], @@ -77,44 +132,3 @@ def docId_word_soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, verbose=verbose ) return sample_index, feature_index, soa_score - - def soa(self, X:csr_matrix, unit_distribution:numpy.ndarray, - n_total_docs:int, - feature_index:int, - sample_index:int, verbose=False): - # X is either of term-frequency matrix per label or document-frequency per label - assert isinstance(X, csr_matrix) - assert isinstance(unit_distribution, numpy.ndarray) - assert isinstance(feature_index, int) - assert isinstance(sample_index, int) - - matrix_size = X.shape - NOT_sample_indexes = [i for i in range(0, matrix_size[0]) if i != sample_index] - - # freq_w_e is term-frequency(or document-frequency) of w in the unit having the specific label e - freq_w_e = X[sample_index, feature_index] - # freq_w_not_e is term-frequency(or document-frequency) of w in units except the specific label e - freq_w_not_e = X[NOT_sample_indexes, feature_index].sum() - # freq_e is the number of the unit having specific label e - freq_e = unit_distribution[sample_index] - # freq_not_e is the number of the unit NOT having the specific label e - freq_not_e = n_total_docs - freq_e - - if verbose: - logging.debug('For feature_index:{} sample_index:{}'.format(feature_index, sample_index)) - logging.debug('freq_w_e:{} freq_w_not_e:{} freq_e:{} freq_not_e:{}'.format( - freq_w_e, - freq_w_not_e, - freq_e, - freq_not_e - )) - - if freq_w_e == 0 or freq_w_not_e == 0 or freq_e == 0 or freq_not_e == 0: - return 0 - else: - nominator = (float(freq_w_e) * freq_not_e) - denominator = (float(freq_e) * freq_w_not_e) - ans = nominator / denominator - assert isinstance(ans, float) - soa_val = math.log(ans, 2) - return soa_val \ No newline at end of file diff --git a/examples/check_performance.py b/examples/check_performance.py index b254aeb..c297937 100644 --- a/examples/check_performance.py +++ b/examples/check_performance.py @@ -21,7 +21,7 @@ def pmi_with_parallel(input_corpus): joblib_backend='multiprocessing' ) elapsed_time = time.time() - start - print ("elapsed_time with multiprocess:{}".format(elapsed_time)) + "[sec]" + print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) #@profile @@ -35,7 +35,7 @@ def pmi_with_threading(input_corpus): joblib_backend='threading' ) elapsed_time = time.time() - start - print ("elapsed_time with multithreading:{}".format(elapsed_time)) + "[sec]" + print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) def pmi_with_cython(input_corpus): @@ -48,7 +48,7 @@ def 
pmi_with_cython(input_corpus): use_cython=True ) elapsed_time = time.time() - start - print ("elapsed_time with multithreading:{}".format(elapsed_time)) + "[sec]" + print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) from nltk.corpus import gutenberg from nltk.corpus import webtext @@ -68,5 +68,5 @@ def pmi_with_cython(input_corpus): } pmi_with_cython(input_corpus) -pmi_with_parallel(input_corpus) -pmi_with_threading(input_corpus) +#pmi_with_parallel(input_corpus) +#pmi_with_threading(input_corpus) diff --git a/examples/example_python3.py b/examples/example_python3.py index 19241c7..7cde041 100644 --- a/examples/example_python3.py +++ b/examples/example_python3.py @@ -5,7 +5,7 @@ import logging import pprint logger = logging.getLogger('sample usage') -logger.level = logging.DEBUG +logger.level = logging.ERROR # ====================================================================================================== @@ -31,6 +31,8 @@ ] } +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# tf idf tf_idf_scored_object = interface.run_feature_selection( input_dict=input_dict, @@ -39,6 +41,8 @@ n_jobs=5 ) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# pmi pmi_scored_object = interface.run_feature_selection( input_dict=input_dict, method='pmi', @@ -59,7 +63,8 @@ ) pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) - +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# soa soa_scored_object = interface.run_feature_selection( input_dict=input_dict, method='soa', @@ -68,7 +73,18 @@ ) pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary()) +soa_scored_object_cython = interface.run_feature_selection( + input_dict=input_dict, + method='soa', + ngram=1, + n_jobs=1, + use_cython=True +) +pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary()) + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# bns input_dict = { "positive": [ ["I", "aa", "aa", "aa", "aa", "aa"], @@ -108,7 +124,18 @@ ] } +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# tf idf +tf_idf_scored_object = interface.run_feature_selection( + input_dict=input_dict_tuple_feature, + method='tf_idf', + n_jobs=5 +) +pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# pmi pmi_scored_object = interface.run_feature_selection( input_dict=input_dict_tuple_feature, method='pmi', @@ -117,6 +144,17 @@ pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) +pmi_scored_object_cython = interface.run_feature_selection( + input_dict=input_dict_tuple_feature, + method='pmi', + n_jobs=1, + use_cython=True +) +pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary()) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# soa soa_scored_object = interface.run_feature_selection( input_dict=input_dict_tuple_feature, method='soa', @@ -125,14 +163,19 @@ pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary()) -tf_idf_scored_object = interface.run_feature_selection( +soa_scored_object_cython = 
interface.run_feature_selection( input_dict=input_dict_tuple_feature, - method='tf_idf', - n_jobs=5 + method='soa', + n_jobs=1, + use_cython=True ) -pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) +pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary()) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# bns input_dict_tuple_feature = { "positive": [ [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], diff --git a/setup.py b/setup.py index a023c26..2e6fef7 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,8 @@ ext_modules = [ ] if use_cython: ext_modules += [ - Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.pyx" ]), + Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.pyx" ],), + Extension("DocumentFeatureSelection.soa.soa_cython", [ "DocumentFeatureSelection/soa/soa_cython.pyx" ],) ] cmdclass.update({ 'build_ext': build_ext }) else: From c806a24199b4bf9dd60f470cee6886dc68bacc6f Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 28 Sep 2016 11:28:47 +0900 Subject: [PATCH 09/12] updated README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fc1a476..f4fe714 100644 --- a/README.md +++ b/README.md @@ -152,5 +152,9 @@ Removed a bug when calling n_gram method of DataConverter * A bug in calculating TF-IDF score, this bug was resolved. -## 1.3 2016/9/ +## 1.3 2016/9/28 + +* Resolved bottleneck poins in pre-processing + * Introduced dict-vectorising in ScikitLearn + * Introduced Cython in calculating PMI \& SOA. You can call them with `use_cython=True` flag. 
See `examples/example_python3.py` From 28e2ae0d61ec290f3d79e3a3d7d7db02ff5bb560 Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 28 Sep 2016 12:35:34 +0900 Subject: [PATCH 10/12] resolved a bug in soa_cython.pyx --- DocumentFeatureSelection/soa/soa_cython.pyx | 2 +- examples/check_performance.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/DocumentFeatureSelection/soa/soa_cython.pyx b/DocumentFeatureSelection/soa/soa_cython.pyx index 19f4635..7ab7a97 100644 --- a/DocumentFeatureSelection/soa/soa_cython.pyx +++ b/DocumentFeatureSelection/soa/soa_cython.pyx @@ -66,7 +66,7 @@ def main(X, ( sample_index, feature_index, - soa(X, n_samples, n_docs_distribution, feature_index, sample_index, verbose) + soa(X, n_docs_distribution, n_total_doc, feature_index, sample_index, verbose) ) for sample_index in sample_range for feature_index in feature_range diff --git a/examples/check_performance.py b/examples/check_performance.py index c297937..999093b 100644 --- a/examples/check_performance.py +++ b/examples/check_performance.py @@ -48,7 +48,7 @@ def pmi_with_cython(input_corpus): use_cython=True ) elapsed_time = time.time() - start - print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)) + print ("elapsed_time with cython:{} [sec]".format(elapsed_time)) from nltk.corpus import gutenberg from nltk.corpus import webtext @@ -68,5 +68,5 @@ def pmi_with_cython(input_corpus): } pmi_with_cython(input_corpus) -#pmi_with_parallel(input_corpus) +pmi_with_parallel(input_corpus) #pmi_with_threading(input_corpus) From 91805e48edd4659b31b16d373e467e23d840ea3f Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 5 Oct 2016 10:04:10 +0900 Subject: [PATCH 11/12] updated setup.py & discription in README --- README.md | 22 ++++++++++++++++++---- setup.py | 36 +++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index f4fe714..c957f55 100644 --- a/README.md +++ b/README.md @@ -92,9 +92,6 @@ or ```George Forman, "An Extensive Empirical Study of Feature Selection Metrics for Text Classification",Journal of Machine Learning Research 3 (2003) 1289-1305``` - - - # Requirement @@ -107,6 +104,21 @@ or `python setup.py install` +### Note + +You might see error message during running this command, such as + +``` +We failed to install numpy automatically. Try installing numpy manually or Try anaconda distribution. +``` + +This is because `setup.py` tries to instal numpy and scipy with `pip`, however it fails. +We need numpy and scipy before we install `scikit-learn`. + +In this case, you take following choice + +* You install `numpy` and `scipy` manually +* You use `anaconda` python distribution. Please visit [their site](https://www.continuum.io/downloads). # Examples @@ -157,4 +169,6 @@ Removed a bug when calling n_gram method of DataConverter * Resolved bottleneck poins in pre-processing * Introduced dict-vectorising in ScikitLearn * Introduced Cython in calculating PMI \& SOA. You can call them with `use_cython=True` flag. See `examples/example_python3.py` - +* Performance + * Cython PMI takes 11.87 sec. + * Python multiprocessing PMI takes 513.541 sec. (8.55 min.) 
\ No newline at end of file diff --git a/setup.py b/setup.py index 2e6fef7..b3e21f9 100644 --- a/setup.py +++ b/setup.py @@ -6,15 +6,16 @@ __version__ = '1.3' import sys +import pip from setuptools import setup, find_packages +from distutils.extension import Extension + -# Flags to compile Cython code or use already compiled code # -------------------------------------------------------------------------------------------------------- +# Flags to compile Cython code or use already compiled code try: from Cython.Build import cythonize - from distutils.extension import Extension from Cython.Distutils import build_ext - import numpy except ImportError: use_cython = False else: @@ -32,14 +33,34 @@ ext_modules += [ Extension("DocumentFeatureSelection.pmi.pmi_cython", [ "DocumentFeatureSelection/pmi/pmi_cython.c" ]), ] -# -------------------------------------------------------------------------------------------------------- +# -------------------------------------------------------------------------------------------------------- +# try to install numpy automatically because sklearn requires the status where numpy is already installed +try: + import numpy +except ImportError: + use_numpy_include_dirs = False + try: + pip.main(['install', 'numpy']) + except: + raise Exception('We failed to install numpy automatically. Try installing numpy manually or Try anaconda distribution.') +# -------------------------------------------------------------------------------------------------------- +# try to install scipy automatically because sklearn requires the status where scipy is already installed +try: + import scipy +except ImportError: + use_numpy_include_dirs = False + try: + pip.main(['install', 'scipy']) + except: + raise Exception('We failed to install scipy automatically. Try installing scipy manually or Try anaconda distribution.') +# -------------------------------------------------------------------------------------------------------- python_version = sys.version_info if python_version >= (3, 0, 0): - install_requires = ['six', 'setuptools>=1.0', 'joblib', - 'scipy', 'nltk', 'scikit-learn', 'numpy', 'pypandoc', 'cython', 'scikit-learn'] + install_requires = ['six', 'setuptools>=1.0', 'joblib', 'numpy', + 'scipy', 'nltk', 'scikit-learn', 'pypandoc', 'cython'] else: raise Exception('This package does NOT support Python2.x') @@ -75,7 +96,8 @@ zip_safe=False, test_suite='tests.all_tests.suite', install_requires=install_requires, - setup_requires=['six', 'setuptools>=1.0'], + tests_require=install_requires, + setup_requires=['six', 'setuptools>=1.0', 'pip'], classifiers=[], cmdclass=cmdclass, ext_modules=ext_modules, From 6912bd23d58f2ea859c794aa6c48883bfdb60eab Mon Sep 17 00:00:00 2001 From: Kensuke-Mitsuzawa Date: Wed, 5 Oct 2016 10:05:25 +0900 Subject: [PATCH 12/12] updated version tag --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b3e21f9..4a91f12 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ """ __author__ = 'kensuke-mi' -__version__ = '1.3' +__version__ = '1.3.1' import sys import pip
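
Note on the two core changes in this series, with a small illustrative sketch for readers who want to check the behaviour without building the Cython extension. Everything below is written for this note only: the helper name `pmi_cell`, the toy `labeled_documents` corpus and the dense arrays are assumptions made for illustration, not the package's public API. The shipped implementations live in DocumentFeatureSelection/pmi/PMI_python3.py, DocumentFeatureSelection/pmi/pmi_cython.pyx and DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py, and they operate on scipy csr_matrix input rather than the small dense array used here.

```
import math
import numpy
from collections import Counter
from sklearn.feature_extraction import DictVectorizer

# 1) Pre-processing sketch: build a label-by-feature document-frequency matrix
#    with DictVectorizer, mirroring the path added in multiDocs2DocFreqInfo().
labeled_documents = {
    'label_a': [['hello', 'world'], ['hello', 'python']],
    'label_b': [['cython', 'python'], ['cython', 'speed'], ['speed']],
}
counted = sorted(
    ((label, Counter(word for doc in docs for word in set(doc)))
     for label, docs in labeled_documents.items()),
    key=lambda label_counter: label_counter[0])
vec = DictVectorizer()
X = vec.fit_transform([dict(counter) for _, counter in counted]).toarray()
n_docs_distribution = numpy.array([len(labeled_documents[label]) for label, _ in counted])
n_total_doc = int(n_docs_distribution.sum())

# 2) Scoring sketch: the four-cell PMI evaluated by both the pure-Python and the
#    Cython code paths; n_11/n_01/n_10/n_00 is the per-(label, feature) contingency table.
def pmi_cell(X, n_docs_distribution, n_total_doc, sample_index, feature_index):
    other_rows = [i for i in range(X.shape[0]) if i != sample_index]
    n_11 = X[sample_index, feature_index]                      # docs of this label having the feature
    n_01 = n_docs_distribution[sample_index] - n_11            # docs of this label without the feature
    n_10 = X[other_rows, feature_index].sum()                  # docs of other labels having the feature
    n_00 = n_total_doc - (n_10 + n_docs_distribution[sample_index])  # remaining docs
    if n_11 == 0.0 or n_10 == 0.0 or n_01 == 0.0 or n_00 == 0.0:
        return 0.0
    return (n_11 / n_total_doc * math.log((n_total_doc * n_11) / ((n_10 + n_11) * (n_01 + n_11)), 2)
            + n_01 / n_total_doc * math.log((n_total_doc * n_01) / ((n_00 + n_01) * (n_01 + n_11)), 2)
            + n_10 / n_total_doc * math.log((n_total_doc * n_10) / ((n_10 + n_11) * (n_00 + n_10)), 2)
            + n_00 / n_total_doc * math.log((n_total_doc * n_00) / ((n_00 + n_01) * (n_00 + n_10)), 2))

# get_feature_names() matches the sklearn versions targeted by this series;
# newer sklearn releases rename it to get_feature_names_out().
for sample_index, (label, _) in enumerate(counted):
    for feature_index, feature in enumerate(vec.get_feature_names()):
        score = pmi_cell(X, n_docs_distribution, n_total_doc, sample_index, feature_index)
        if score != 0.0:
            print(label, feature, round(score, 4))
```

The sketch also suggests where the speed-up in the README comes from: DictVectorizer replaces the hand-built structured-array vocabulary lookups of earlier revisions, and the Cython module evaluates the same per-cell formula in a tight loop instead of dispatching one joblib task per (label, feature) pair, which is consistent with the reported 11.87 sec (Cython) versus 513.541 sec (multiprocessing) timings.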