Commit 03c74fc
Merge pull request #16 from Kensuke-Mitsuzawa/14_sk
14 sk
Kensuke-Mitsuzawa authored Oct 5, 2016
2 parents f00deaf + 6912bd2 commit 03c74fc
Showing 17 changed files with 688 additions and 326 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@ __pycache__/
 
 # C extensions
 *.so
+*.c
 
 # Distribution / packaging
 .Python
16 changes: 13 additions & 3 deletions DocumentFeatureSelection/bns/bns_python3.py
@@ -26,7 +26,7 @@ def __check_matrix_form(self, X):
         if n_categories != 2:
             raise Exception('BNS input must be of 2 categories')
 
-    def fit_transform(self, X, y=None, **fit_params):
+    def fit_transform(self, X:csr_matrix, y=None, **fit_params):
         assert isinstance(X, csr_matrix)
 
         if not 'unit_distribution' in fit_params:
@@ -88,7 +88,12 @@ def fit_transform(self, X, y=None, **fit_params):
 
         return bns_featured_csr_matrix
 
-    def docId_word_BNS(self, X, feature_index, sample_index, unit_distribution, true_index, verbose=False):
+    def docId_word_BNS(self, X:csr_matrix,
+                       feature_index:int,
+                       sample_index:int,
+                       unit_distribution:np.ndarray,
+                       true_index:int,
+                       verbose=False):
 
         assert isinstance(X, csr_matrix)
         assert isinstance(feature_index, int)
@@ -104,7 +109,12 @@ def docId_word_BNS(self, X, feature_index, sample_index, unit_distribution, true
         )
         return sample_index, feature_index, bns_score
 
-    def bns(self, X, feature_index, sample_index, unit_distribution, true_index=0, verbose=False):
+    def bns(self, X:csr_matrix,
+            feature_index:int,
+            sample_index:int,
+            unit_distribution:np.ndarray,
+            true_index:int=0,
+            verbose:bool=False):
         if true_index==0:
             false_index = 1
         elif true_index==1:
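The methods annotated above implement Bi-Normal Separation (BNS) scoring over a two-category document matrix. For readers unfamiliar with the metric, here is a minimal, self-contained sketch of what a BNS score computes, assuming the standard definition |F⁻¹(tpr) − F⁻¹(fpr)| from Forman (2003), where F⁻¹ is the inverse normal CDF. It is an illustration, not this repository's exact implementation; the helper name `bns_score` and the clipping constant are invented here.

```python
# A minimal sketch of Bi-Normal Separation (BNS) scoring, independent of the
# repository's classes. Assumption: the standard BNS definition (Forman 2003),
# with rates clipped away from 0 and 1 so the inverse normal CDF stays finite.
import numpy as np
from scipy.stats import norm

def bns_score(tp: int, fp: int, pos: int, neg: int) -> float:
    """BNS = |F^-1(tpr) - F^-1(fpr)|, where F^-1 is the normal inverse CDF."""
    eps = 0.0005  # clip rates to avoid infinite z-scores at exactly 0.0 or 1.0
    tpr = np.clip(tp / pos, eps, 1 - eps)  # true positive rate of the feature
    fpr = np.clip(fp / neg, eps, 1 - eps)  # false positive rate of the feature
    return abs(norm.ppf(tpr) - norm.ppf(fpr))

# Example: a feature occurring in 80 of 100 positive docs and 10 of 100 negative docs
print(bns_score(tp=80, fp=10, pos=100, neg=100))  # ~2.12
```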
61 changes: 38 additions & 23 deletions DocumentFeatureSelection/common/crs_matrix_constructor.py
@@ -3,7 +3,10 @@
 import joblib
 import sys
 import logging
+import numpy
+from typing import List, Tuple, Dict
 from scipy.sparse import csr_matrix
+from sklearn.feature_extraction import DictVectorizer
 
 logging.basicConfig(format='%(asctime)s %(message)s',
                     datefmt='%m/%d/%Y %I:%M:%S %p',
@@ -15,37 +18,48 @@
 python_version = sys.version_info
 __author__ = 'kensuke-mi'
 
-PosTuple = namedtuple('PosTuple', ('doc_id', 'word_id', 'document_frequency'))
+
+class PosTuple(object):
+    __slots__ = ['doc_id', 'word_id', 'document_frequency']
+    def __init__(self, doc_id, word_id, document_frequency):
+        self.doc_id = doc_id
+        self.word_id = word_id
+        self.document_frequency = document_frequency
+
 
 PARAM_JOBLIB_BACKEND = ['multiprocessing', 'threading']
 
-def get_data_col_row_values(doc_id:int, word:int, doc_freq:int, vocaburary):
-    assert isinstance(vocaburary, dict)
-    try:
-        col_value = vocaburary[word]
-    except KeyError:
-        print()
+def get_data_col_row_values(doc_id:int, word:int, doc_freq:int, vocaburary:numpy.ndarray)->numpy.array:
+    """* what you can do
+    - You get array of [document_id, feature_id, value(frequency)]
+    """
+    assert isinstance(vocaburary, numpy.ndarray)
+    col_element = vocaburary[numpy.where(vocaburary['key']==word)]
+    assert len(col_element) == 1
+    col_value = col_element[0]['value']
     # df value is word frequency in documents
     df_value = doc_freq
 
-    return PosTuple(doc_id, col_value, df_value)
+    return numpy.array([doc_id, col_value, df_value])
+
+def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary:numpy.ndarray)->numpy.ndarray:
+
+    value_pairs = numpy.array([
+        get_data_col_row_values(doc_id=doc_id, word=key_value_tuple['key'], doc_freq=key_value_tuple['value'], vocaburary=vocabulary)
+        for key_value_tuple
+        in doc_freq_obj])
 
-def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj, vocabulary):
-    value_pairs = [
-        get_data_col_row_values(doc_id=doc_id, word=word, doc_freq=freq, vocaburary=vocabulary)
-        for word, freq
-        in doc_freq_obj.items()
-    ]
-    assert isinstance(value_pairs, list)
     return value_pairs
 
-def make_csr_list(value_position_list):
+def make_csr_list(value_position_list:List[numpy.array])->Tuple[List[int], List[int], List[int]]:
     data = []
     row = []
     col = []
     for position_tuple in value_position_list:
-        row.append(position_tuple.doc_id)
-        col.append(position_tuple.word_id)
-        data.append(position_tuple.document_frequency)
+        row.append(position_tuple[0])
+        col.append(position_tuple[1])
+        data.append(position_tuple[2])
 
     return row, col, data
 
@@ -74,7 +88,7 @@ def preprocess_csr_matrix(feature_frequency, vocabulary, n_jobs:int, joblib_back
         assert Exception('joblib_backend parameter must be either of {}. However your input is {}.'.format(PARAM_JOBLIB_BACKEND, joblib_backend))
 
     assert isinstance(feature_frequency, list)
-    assert isinstance(vocabulary, dict)
+    assert isinstance(vocabulary, numpy.ndarray)
     assert isinstance(n_jobs, int)
 
     logger.debug(msg='making tuple pairs for csr matrix with n(process)={}'.format(n_jobs))
@@ -86,11 +100,12 @@
             vocabulary
         )
         for doc_id, doc_freq_obj in enumerate(feature_frequency)
-    )
+    )  # type: List[numpy.ndarray]
 
     # make 2-d list into 1-d list
     value_position_list = sorted(
         [l for set in set_value_position_list for l in set],
-        key=lambda pos_tuple: (pos_tuple[0], pos_tuple[1], pos_tuple[2])
-    )
+        key=lambda pos_tuple: (pos_tuple[0], pos_tuple[1], pos_tuple[2]))
 
     row, col, data = make_csr_list(value_position_list)
 
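The change above replaces the dict-based vocabulary with a NumPy structured array carrying `key`/`value` fields, looked up via `numpy.where`, and feeds `(row, col, data)` triples from `make_csr_list` into a CSR matrix. The following self-contained sketch illustrates that flow end to end; the toy vocabulary, the sample documents, and the helper `lookup_feature_id` are invented for illustration, and only the `key`/`value` lookup idiom is taken from the diff.

```python
# A runnable sketch of the structured-array lookup and (row, col, data) ->
# csr_matrix flow shown above. The 'key'/'value' field names mirror the diff;
# everything else here is a made-up example.
import numpy as np
from scipy.sparse import csr_matrix

# vocabulary as a structured array: feature string -> feature id
vocabulary = np.array([('cat', 0), ('dog', 1), ('bird', 2)],
                      dtype=[('key', 'U10'), ('value', 'i8')])

def lookup_feature_id(word: str) -> int:
    # same idiom as get_data_col_row_values: select rows whose 'key' matches
    col_element = vocabulary[np.where(vocabulary['key'] == word)]
    assert len(col_element) == 1
    return int(col_element[0]['value'])

# one {word: frequency} mapping per document
docs = [{'cat': 2, 'dog': 1}, {'bird': 3}]

row, col, data = [], [], []
for doc_id, freq_obj in enumerate(docs):
    for word, freq in freq_obj.items():
        row.append(doc_id)                     # document id -> matrix row
        col.append(lookup_feature_id(word))    # feature id  -> matrix column
        data.append(freq)                      # frequency   -> cell value

matrix = csr_matrix((data, (row, col)), shape=(len(docs), len(vocabulary)))
print(matrix.toarray())  # [[2 1 0], [0 0 3]]
```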
96 changes: 29 additions & 67 deletions DocumentFeatureSelection/common/data_converter.py
@@ -12,6 +12,8 @@
 from DocumentFeatureSelection import init_logger
 import logging
 import sys
+import numpy
+import pickle
 from typing import Dict, List, Tuple, Union, Any
 python_version = sys.version_info
 logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
@@ -66,11 +68,12 @@ def __check_data_structure(self, labeled_documents):
 
         return True
 
-    def count_term_frequency_distribution(self, labeled_documents, label2id_dict):
-
+    def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int]):
         """Count term-distribution per label.
         """
         assert isinstance(labeled_documents, dict)
-        assert isinstance(label2id_dict, dict)
+        assert isinstance(label2id, dict)
 
         # count total term-frequency per label
         term_frequency_distribution = {
@@ -83,19 +86,22 @@ def count_term_frequency_distribution(self, labeled_documents, label2id_dict):
         term_frequency_distribution_list = [0] * len(labeled_documents.keys())
 
         for label_string, n_doc in term_frequency_distribution.items():
-            term_frequency_distribution_list[label2id_dict[label_string]] = n_doc
+            #term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value']
+            term_index = label2id[label_string]
+            term_frequency_distribution_list[term_index] = n_doc
 
-        return term_frequency_distribution_list
+        return numpy.array(term_frequency_distribution_list, dtype='i8')
 
-    def count_document_distribution(self, labeled_documents, label2id_dict):
+    def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int])->numpy.ndarray:
         """This method count n(docs) per label.
         :param labeled_documents:
-        :param label2id_dict:
+        :param label2id:
         :return:
         """
         assert isinstance(labeled_documents, dict)
-        assert isinstance(label2id_dict, dict)
+        assert isinstance(label2id, dict)
 
         # count n(docs) per label
         n_doc_distribution = {
@@ -108,9 +114,11 @@ def count_document_distribution(self, labeled_documents, label2id_dict):
         n_doc_distribution_list = [0] * len(labeled_documents.keys())
 
         for label_string, n_doc in n_doc_distribution.items():
-            n_doc_distribution_list[label2id_dict[label_string]] = n_doc
+            #docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value']
+            docs_index = label2id[label_string]
+            n_doc_distribution_list[docs_index] = n_doc
 
-        return n_doc_distribution_list
+        return numpy.array(n_doc_distribution_list, dtype='i8')
 
     def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, joblib_backend='auto'):
         """This function makes TERM-frequency matrix for TF-IDF calculation.
@@ -128,49 +136,27 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1,
         logger.debug(msg='Now pre-processing before CSR matrix')
         # convert data structure
         set_document_information = labeledMultiDocs2labeledDocsSet.multiDocs2TermFreqInfo(labeled_documents)
         assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation)
-        logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature_frequency)))
-        if joblib_backend == 'auto' and len(set_document_information.feature_frequency) >= 100000:
-            joblib_backend = 'threading'
-        if joblib_backend == 'auto' and len(set_document_information.feature_frequency) < 100000:
-            joblib_backend = 'multiprocessing'
-
-        # make set of tuples to construct csr_matrix
-        row, col, data = crs_matrix_constructor.preprocess_csr_matrix(
-            feature_frequency=set_document_information.feature_frequency,
-            vocabulary=set_document_information.feature2id_dict,
-            n_jobs=n_jobs,
-            joblib_backend=joblib_backend
-        )
-        logger.debug(msg='Finished pre-processing before CSR matrix')
-        csr_matrix_ = crs_matrix_constructor.make_csr_objects(
-            row=row, col=col, data=data,
-            n_feature=max(set_document_information.feature2id_dict.values())+1,
-            n_docs=len(set_document_information.feature_frequency))
-
         # count n(docs) per label
         n_docs_distribution = self.count_document_distribution(
             labeled_documents=labeled_documents,
-            label2id_dict=set_document_information.label2id_dict
+            label2id=set_document_information.label2id
         )
         # count term-frequency per label
         term_frequency_distribution = self.count_term_frequency_distribution(
             labeled_documents=labeled_documents,
-            label2id_dict=set_document_information.label2id_dict
+            label2id=set_document_information.label2id
         )
 
-        assert isinstance(csr_matrix_, csr_matrix)
-        assert isinstance(set_document_information.label2id_dict, dict)
-        assert isinstance(set_document_information.feature2id_dict, dict)
-        assert isinstance(n_docs_distribution, list)
         return DataCsrMatrix(
-            csr_matrix_,
-            set_document_information.label2id_dict,
-            set_document_information.feature2id_dict,
+            set_document_information.matrix_object,
+            set_document_information.label2id,
+            set_document_information.feature2id,
             n_docs_distribution, term_frequency_distribution)
 
 
-    def labeledMultiDocs2DocFreqMatrix(self, labeled_documents,
+    def labeledMultiDocs2DocFreqMatrix(self,
+                                       labeled_documents:Dict[str,List[Any]],
+                                       ngram:int=1,
+                                       n_jobs:int=1,
+                                       joblib_backend:str='auto')->DataCsrMatrix:
@@ -226,47 +212,23 @@ def labeledMultiDocs2DocFreqMatrix(self, labeled_documents,
         logger.debug(msg='Now pre-processing before CSR matrix')
         # convert data structure
         set_document_information = labeledMultiDocs2labeledDocsSet.multiDocs2DocFreqInfo(labeled_documents,
-                                                                                         n_jobs=n_jobs,
-                                                                                         joblib_backend=joblib_backend)
+                                                                                         n_jobs=n_jobs)
         assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation)
-        logger.info(msg='Get {} feature-dimension from your input data.'.format(len(set_document_information.feature2id_dict)))
-        if joblib_backend == 'auto' and len(set_document_information.feature_frequency) >= 100000:
-            joblib_backend = 'threading'
-        if joblib_backend == 'auto' and len(set_document_information.feature_frequency) < 100000:
-            joblib_backend = 'multiprocessing'
-
-        # make set of tuples to construct csr_matrix
-        row, col, data = crs_matrix_constructor.preprocess_csr_matrix(
-            feature_frequency=set_document_information.feature_frequency,
-            vocabulary=set_document_information.feature2id_dict,
-            n_jobs=n_jobs,
-            joblib_backend=joblib_backend
-        )
-        logger.debug(msg='Finished pre-processing before CSR matrix')
-        csr_matrix_ = crs_matrix_constructor.make_csr_objects(
-            row=row, col=col, data=data,
-            n_feature=max(set_document_information.feature2id_dict.values())+1,
-            n_docs=len(set_document_information.feature_frequency))
-
         # count n(docs) per label
         n_docs_distribution = self.count_document_distribution(
             labeled_documents=labeled_documents,
-            label2id_dict=set_document_information.label2id_dict
+            label2id=set_document_information.label2id
         )
         # count term-frequency per label
         term_frequency_distribution = self.count_term_frequency_distribution(
             labeled_documents=labeled_documents,
-            label2id_dict=set_document_information.label2id_dict
+            label2id=set_document_information.label2id
        )
 
-        assert isinstance(csr_matrix_, csr_matrix)
-        assert isinstance(set_document_information.label2id_dict, dict)
-        assert isinstance(set_document_information.feature2id_dict, dict)
-        assert isinstance(n_docs_distribution, list)
         return DataCsrMatrix(
-            csr_matrix_,
-            set_document_information.label2id_dict,
-            set_document_information.feature2id_dict,
+            set_document_information.matrix_object,
+            set_document_information.label2id,
+            set_document_information.feature2id,
             n_docs_distribution, term_frequency_distribution)
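Both renamed counting methods above follow the same pattern: aggregate counts into a dict keyed by label string, place them into a list whose positions come from `label2id`, and return the result as a NumPy int64 array (`dtype='i8'`). A standalone sketch of that logic, with invented sample labels and documents:

```python
# A standalone sketch of the per-label counting that count_document_distribution
# performs after this change. The dict/list/array steps mirror the diff;
# the sample data is made up for illustration.
import numpy as np

labeled_documents = {
    'positive': [['good', 'movie'], ['great', 'acting']],
    'negative': [['bad', 'plot']],
}
label2id = {'positive': 0, 'negative': 1}

# count n(docs) per label
n_doc_distribution = {label: len(docs) for label, docs in labeled_documents.items()}

# place each count at the array position assigned by label2id
n_doc_distribution_list = [0] * len(labeled_documents)
for label_string, n_doc in n_doc_distribution.items():
    n_doc_distribution_list[label2id[label_string]] = n_doc

print(np.array(n_doc_distribution_list, dtype='i8'))  # [2 1]
```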
(Diff truncated: the remaining 13 changed files did not load.)