Skip to content

Commit

Permalink
Merge pull request #18 from Kensuke-Mitsuzawa/devel
Browse files Browse the repository at this point in the history
Devel
  • Loading branch information
Kensuke-Mitsuzawa authored Nov 28, 2016
2 parents 03c74fc + 9a52616 commit cbd2e17
Show file tree
Hide file tree
Showing 11 changed files with 405 additions and 229 deletions.
2 changes: 1 addition & 1 deletion DocumentFeatureSelection/common/crs_matrix_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary
return value_pairs


def make_csr_list(value_position_list:List[numpy.array])->Tuple[List[int], List[int], List[int]]:
def make_csr_list(value_position_list:List[numpy.ndarray])->Tuple[List[int], List[int], List[int]]:
data = []
row = []
col = []
Expand Down
145 changes: 41 additions & 104 deletions DocumentFeatureSelection/common/data_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,60 +3,35 @@
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from DocumentFeatureSelection.common import utils
from scipy.sparse import csr_matrix
from DocumentFeatureSelection.common import crs_matrix_constructor
from DocumentFeatureSelection.common import labeledMultiDocs2labeledDocsSet
from DocumentFeatureSelection.common import ngram_constructor
from DocumentFeatureSelection.models import DataCsrMatrix, FeatureType
from DocumentFeatureSelection.common import utils, labeledMultiDocs2labeledDocsSet, ngram_constructor
from DocumentFeatureSelection.models import DataCsrMatrix, FeatureType, AvailableInputTypes
from DocumentFeatureSelection import init_logger
from scipy.sparse import csr_matrix
from sqlitedict import SqliteDict
import logging
import sys
import numpy
import pickle
from typing import Dict, List, Tuple, Union, Any
from typing import Dict
python_version = sys.version_info
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))

__author__ = 'kensuke-mi'

"""
Example:
>>> input_format = {
"label_a": [
["I", "aa", "aa", "aa", "aa", "aa"],
["bb", "aa", "aa", "aa", "aa", "aa"],
["I", "aa", "hero", "some", "ok", "aa"]
],
"label_b": [
["bb", "bb", "bb"],
["bb", "bb", "bb"],
["hero", "ok", "bb"],
["hero", "cc", "bb"],
],
"label_c": [
["cc", "cc", "cc"],
["cc", "cc", "bb"],
["xx", "xx", "cc"],
["aa", "xx", "cc"],
]
}
"""


class DataConverter(object):
"""
"""This class is for converting data type from dict-object into DataCsrMatrix-object which saves information of matrix.
"""
def __check_data_structure(self, labeled_documents):
# type: (Dict[str, Union[str, List[Any], Tuple[Any]]])->bool
# type: AvailableInputTypes->bool
"""* what you can do
- This function checks input data structure
"""
assert isinstance(labeled_documents, dict)
for key in labeled_documents.keys():
assert isinstance(labeled_documents, (SqliteDict, dict))
for key, value in labeled_documents.items():
docs_in_label = labeled_documents[key]
assert isinstance(docs_in_label, list)
if not isinstance(docs_in_label, list):
logger.error(msg=docs_in_label)
raise TypeError('It expects list object. But your object has {}'.format(type(docs_in_label)))
for doc in docs_in_label:
for t in doc:
if isinstance(t, (str)):
Expand All @@ -68,11 +43,10 @@ def __check_data_structure(self, labeled_documents):

return True


def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int]):
def count_term_frequency_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int]):
"""Count term-distribution per label.
"""
assert isinstance(labeled_documents, dict)
assert isinstance(labeled_documents, (SqliteDict, dict))
assert isinstance(label2id, dict)

# count total term-frequency per label
Expand All @@ -83,7 +57,9 @@ def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]
}

# make list of distribution
term_frequency_distribution_list = [0] * len(labeled_documents.keys())
#term_frequency_distribution_list = [0] * len(labeled_documents.keys())
# TODO
term_frequency_distribution_list = [0] * len(labeled_documents)

for label_string, n_doc in term_frequency_distribution.items():
#term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value']
Expand All @@ -92,15 +68,10 @@ def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]

return numpy.array(term_frequency_distribution_list, dtype='i8')


def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int])->numpy.ndarray:
def count_document_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int])->numpy.ndarray:
"""This method count n(docs) per label.
:param labeled_documents:
:param label2id_dict:
:return:
"""
assert isinstance(labeled_documents, dict)
assert isinstance(labeled_documents, (SqliteDict, dict))
assert isinstance(label2id, dict)

# count n(docs) per label
Expand All @@ -111,7 +82,9 @@ def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], lab
}

# make list of distribution
n_doc_distribution_list = [0] * len(labeled_documents.keys())
# TODO
#n_doc_distribution_list = [0] * len(labeled_documents.keys())
n_doc_distribution_list = [0] * len(labeled_documents)

for label_string, n_doc in n_doc_distribution.items():
#docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value']
Expand All @@ -121,6 +94,7 @@ def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], lab
return numpy.array(n_doc_distribution_list, dtype='i8')

def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, joblib_backend='auto'):
# type: (AvailableInputTypes, int, int, str) -> DataCsrMatrix
"""This function makes TERM-frequency matrix for TF-IDF calculation.
TERM-frequency matrix is scipy.csr_matrix.
"""
Expand Down Expand Up @@ -154,52 +128,21 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1,
set_document_information.feature2id,
n_docs_distribution, term_frequency_distribution)


def labeledMultiDocs2DocFreqMatrix(self,
labeled_documents:Dict[str,List[Any]],
labeled_documents:AvailableInputTypes,
ngram:int=1,
n_jobs:int=1,
joblib_backend:str='auto')->DataCsrMatrix:
"""This function makes document-frequency matrix for PMI calculation.
Document-frequency matrix is scipy.csr_matrix.
labeled_structure must be following key-value pair
>>> {
"label_a": [
["I", "aa", "aa", "aa", "aa", "aa"],
["bb", "aa", "aa", "aa", "aa", "aa"],
["I", "aa", "hero", "some", "ok", "aa"]
],
"label_b": [
["bb", "bb", "bb"],
["bb", "bb", "bb"],
["hero", "ok", "bb"],
["hero", "cc", "bb"],
],
"label_c": [
["cc", "cc", "cc"],
["cc", "cc", "bb"],
["xx", "xx", "cc"],
["aa", "xx", "cc"],
]
}
There are 3 output objects.
vocabulary is a dict object mapping token: feature_id
>>> {'I_aa_hero': 4, 'xx_xx_cc': 1, 'I_aa_aa': 2, 'bb_aa_aa': 3, 'cc_cc_bb': 8}
label_group_dict is, dict object with label_name: label_id
>>> {'label_b': 0, 'label_c': 1, 'label_a': 2}
csr_matrix is, sparse matrix from scipy.sparse
:param dict labeled_structure: above data structure
:param int ngram: you can get score with ngram-words
:return: `(csr_matrix: scipy.csr_matrix, label_group_dict: dict, vocabulary: dict)`
:rtype: tuple
"""This function makes document-frequency matrix. Document-frequency matrix is scipy.csr_matrix.
* Input object
- "labeled_documents" is either a dict object or a SqliteDict object. An example of the format is shown below.
>>> {"label_a": [["I", "aa", "aa", "aa", "aa", "aa"],["bb", "aa", "aa", "aa", "aa", "aa"],["I", "aa", "hero", "some", "ok", "aa"]],
>>> "label_b": [["bb", "bb", "bb"],["bb", "bb", "bb"],["hero", "ok", "bb"],["hero", "cc", "bb"],],
>>> "label_c": [["cc", "cc", "cc"],["cc", "cc", "bb"],["xx", "xx", "cc"],["aa", "xx", "cc"],]}
* Output
- DataCsrMatrix object.
"""
self.__check_data_structure(labeled_documents)

Expand Down Expand Up @@ -247,7 +190,7 @@ def __conv_into_dict_format(word_score_items):
return out_format_structure


def ScoreMatrix2ScoreDictionary(scored_matrix:csr_matrix,
def scored_matrix2score_dictionary(scored_matrix:csr_matrix,
label2id_dict:Dict[str,int],
feature2id_dict:Dict[FeatureType,int],
outformat:str='items',
Expand All @@ -259,20 +202,11 @@ def ScoreMatrix2ScoreDictionary(scored_matrix:csr_matrix,
If outformat='dict', you get
>>> {label_name:
{
feature: score
}
}
>>> {label_name:{feature: score}}
Else if outformat='items', you get
>>> [
{
feature: score
}
]
>>> [{feature: score}]
"""

scored_objects = utils.get_feature_dictionary(
Expand All @@ -292,4 +226,7 @@ def ScoreMatrix2ScoreDictionary(scored_matrix:csr_matrix,
else:
raise ValueError('outformat must be either of {dict, items}')

return out_format_structure
return out_format_structure

# for old version code
ScoreMatrix2ScoreDictionary = scored_matrix2score_dictionary
17 changes: 7 additions & 10 deletions DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from collections import namedtuple
from collections import Counter
from DocumentFeatureSelection.common import utils
from DocumentFeatureSelection.models import SetDocumentInformation
from DocumentFeatureSelection.models import SetDocumentInformation, AvailableInputTypes
from DocumentFeatureSelection import init_logger
from sklearn.feature_extraction import DictVectorizer
from typing import Dict, List, Tuple, Any, Union
from sqlitedict import SqliteDict
import logging
import joblib
import itertools
Expand All @@ -30,13 +29,10 @@ def generate_document_dict(document_key:str,
return (document_key, document_frequencies)


def multiDocs2TermFreqInfo(labeled_documents):
def multiDocs2TermFreqInfo(labeled_documents:AvailableInputTypes):
"""This function generates information to construct term-frequency matrix
:param labeled_structure:
:return:
"""
assert isinstance(labeled_documents, dict)
assert isinstance(labeled_documents, (SqliteDict, dict))

counted_frequency = [(label, Counter(list(itertools.chain.from_iterable(documents))))
for label, documents in labeled_documents.items()]
Expand All @@ -61,15 +57,16 @@ def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str:
elif isinstance(feature, tuple):
type_flag = 'tuple'
else:
logger.error(msg=docs)
raise TypeError('Feature object should be either of str or tuple')
return type_flag


def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple[Any]]]]],
def multiDocs2DocFreqInfo(labeled_documents:AvailableInputTypes,
n_jobs:int=1)->SetDocumentInformation:
"""This function generates information for constructing document-frequency matrix.
"""
assert isinstance(labeled_documents, dict)
assert isinstance(labeled_documents, (SqliteDict, dict))
type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()])
assert len(type_flag)==1

Expand Down
26 changes: 17 additions & 9 deletions DocumentFeatureSelection/interface.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
from DocumentFeatureSelection.models import DataCsrMatrix, ScoredResultObject
from DocumentFeatureSelection.models import DataCsrMatrix, ScoredResultObject, AvailableInputTypes
from DocumentFeatureSelection.common import data_converter
from DocumentFeatureSelection.soa.soa_python3 import SOA
from DocumentFeatureSelection.pmi.PMI_python3 import PMI
from DocumentFeatureSelection.tf_idf.tf_idf import TFIDF
from DocumentFeatureSelection.bns.bns_python3 import BNS
from DocumentFeatureSelection import init_logger
from typing import List, Dict, Any, Union, Tuple
from sqlitedict import SqliteDict
from typing import Dict
from scipy.sparse.csr import csr_matrix
import logging
import numpy
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
METHOD_NAMES = ['soa', 'pmi', 'tf_idf', 'bns']
N_FEATURE_SWITCH_STRATEGY = 1000000
Expand All @@ -21,7 +21,7 @@ def decide_joblib_strategy(feature2id_dict:Dict[str,int])->str:
return 'multiprocessing'


def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]],
def run_feature_selection(input_dict:AvailableInputTypes,
method:str,
ngram:int=1,
n_jobs:int=1,
Expand All @@ -32,8 +32,9 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
raise Exception('method name must be either of {}. Yours: {}'.format(METHOD_NAMES, method))

if method == 'tf_idf':
# getting term-frequency matrix.
# ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
"""You get scored-matrix with term-frequency.
ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
"""
matrix_data_object = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix(
labeled_documents=input_dict,
ngram=ngram,
Expand All @@ -46,6 +47,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
assert isinstance(scored_sparse_matrix, csr_matrix)

elif method in ['soa', 'pmi'] and matrix_form is None:
"""You get scored-matrix with either of soa or pmi.
"""
matrix_data_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
labeled_documents=input_dict,
ngram=ngram,
Expand All @@ -69,10 +72,13 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
joblib_backend=backend_strategy,
use_cython=use_cython)
assert isinstance(scored_sparse_matrix, csr_matrix)
else:
raise Exception()

elif method == 'soa' and matrix_form == 'term_freq':
# getting term-frequency matrix.
# ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
"""You get score-matrix with soa from term-frequency matrix.
ATTENTION: in this mode the input for SOA MUST be a term-frequency matrix, NOT a document-frequency matrix.
"""
matrix_data_object = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix(
labeled_documents=input_dict,
ngram=ngram,
Expand All @@ -89,6 +95,9 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
assert isinstance(scored_sparse_matrix, csr_matrix)

elif method == 'bns':
"""You get scored-matrix with bns.
ATTENTION: the number of labels must always be 2; input_dict must have the keys "positive" and "negative".
"""
if not 'positive' in input_dict:
raise KeyError('input_dict must have "positive" key')
if not 'negative' in input_dict:
Expand All @@ -113,7 +122,6 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
joblib_backend=backend_strategy
)
assert isinstance(scored_sparse_matrix, csr_matrix)

else:
raise Exception()

Expand Down
Loading

0 comments on commit cbd2e17

Please sign in to comment.