diff --git a/DocumentFeatureSelection/common/crs_matrix_constructor.py b/DocumentFeatureSelection/common/crs_matrix_constructor.py
index 457112d..9b908d9 100644
--- a/DocumentFeatureSelection/common/crs_matrix_constructor.py
+++ b/DocumentFeatureSelection/common/crs_matrix_constructor.py
@@ -52,7 +52,7 @@ def SUB_FUNC_make_value_pairs(doc_id:int, doc_freq_obj:numpy.ndarray, vocabulary
     return value_pairs
 
 
-def make_csr_list(value_position_list:List[numpy.array])->Tuple[List[int], List[int], List[int]]:
+def make_csr_list(value_position_list:List[numpy.ndarray])->Tuple[List[int], List[int], List[int]]:
     data = []
     row = []
     col = []
diff --git a/DocumentFeatureSelection/common/data_converter.py b/DocumentFeatureSelection/common/data_converter.py
index 15c035a..a04ebef 100644
--- a/DocumentFeatureSelection/common/data_converter.py
+++ b/DocumentFeatureSelection/common/data_converter.py
@@ -3,60 +3,35 @@ from __future__ import print_function
 from __future__ import unicode_literals
 from __future__ import division
 
-from DocumentFeatureSelection.common import utils
-from scipy.sparse import csr_matrix
-from DocumentFeatureSelection.common import crs_matrix_constructor
-from DocumentFeatureSelection.common import labeledMultiDocs2labeledDocsSet
-from DocumentFeatureSelection.common import ngram_constructor
-from DocumentFeatureSelection.models import DataCsrMatrix, FeatureType
+from DocumentFeatureSelection.common import utils, labeledMultiDocs2labeledDocsSet, ngram_constructor
+from DocumentFeatureSelection.models import DataCsrMatrix, FeatureType, AvailableInputTypes
 from DocumentFeatureSelection import init_logger
+from scipy.sparse import csr_matrix
+from sqlitedict import SqliteDict
 import logging
 import sys
 import numpy
-import pickle
-from typing import Dict, List, Tuple, Union, Any
+from typing import Dict
 
 python_version = sys.version_info
 logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
 __author__ = 'kensuke-mi'
 
-"""
-
-Example:
-    >>> input_format = {
-        "label_a": [
-            ["I", "aa", "aa", "aa", "aa", "aa"],
-            ["bb", "aa", "aa", "aa", "aa", "aa"],
-            ["I", "aa", "hero", "some", "ok", "aa"]
-        ],
-        "label_b": [
-            ["bb", "bb", "bb"],
-            ["bb", "bb", "bb"],
-            ["hero", "ok", "bb"],
-            ["hero", "cc", "bb"],
-        ],
-        "label_c": [
-            ["cc", "cc", "cc"],
-            ["cc", "cc", "bb"],
-            ["xx", "xx", "cc"],
-            ["aa", "xx", "cc"],
-        ]
-    }
-"""
-
 
 class DataConverter(object):
-    """
+    """This class converts a dict object (or a persisted dict) into a DataCsrMatrix object, which holds the matrix information.
     """
     def __check_data_structure(self, labeled_documents):
-        # type: (Dict[str, Union[str, List[Any], Tuple[Any]]])->bool
+        # type: (AvailableInputTypes)->bool
        """* what you can do
        - This function checks input data structure
        """
-        assert isinstance(labeled_documents, dict)
-        for key in labeled_documents.keys():
+        assert isinstance(labeled_documents, (SqliteDict, dict))
+        for key, value in labeled_documents.items():
             docs_in_label = labeled_documents[key]
-            assert isinstance(docs_in_label, list)
+            if not isinstance(docs_in_label, list):
+                logger.error(msg=docs_in_label)
+                raise TypeError('It expects list object. But your object has {}'.format(type(docs_in_label)))
             for doc in docs_in_label:
                 for t in doc:
                     if isinstance(t, (str)):
@@ -68,11 +43,10 @@ def __check_data_structure(self, labeled_documents):
 
         return True
 
-
-    def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int]):
+    def count_term_frequency_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int]):
         """Count term-distribution per label.
         """
-        assert isinstance(labeled_documents, dict)
+        assert isinstance(labeled_documents, (SqliteDict, dict))
         assert isinstance(label2id, dict)
 
         # count total term-frequency per label
@@ -83,7 +57,9 @@ def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]
         }
 
         # make list of distribution
-        term_frequency_distribution_list = [0] * len(labeled_documents.keys())
+        # use len(labeled_documents) directly; len() of a keys() view is not
+        # guaranteed to work for persisted-dict inputs such as SqliteDict
+        term_frequency_distribution_list = [0] * len(labeled_documents)
 
         for label_string, n_doc in term_frequency_distribution.items():
             #term_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value']
@@ -92,15 +68,10 @@ def count_term_frequency_distribution(self, labeled_documents:Dict[str,List[Any]
 
         return numpy.array(term_frequency_distribution_list, dtype='i8')
 
-
-    def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], label2id:Dict[str,int])->numpy.ndarray:
+    def count_document_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int])->numpy.ndarray:
         """This method count n(docs) per label.
-
-        :param labeled_documents:
-        :param label2id_dict:
-        :return:
         """
-        assert isinstance(labeled_documents, dict)
+        assert isinstance(labeled_documents, (SqliteDict, dict))
         assert isinstance(label2id, dict)
 
         # count n(docs) per label
@@ -111,7 +82,9 @@ def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], lab
         }
 
         # make list of distribution
-        n_doc_distribution_list = [0] * len(labeled_documents.keys())
+        # use len(labeled_documents) directly; len() of a keys() view is not
+        # guaranteed to work for persisted-dict inputs such as SqliteDict
+        n_doc_distribution_list = [0] * len(labeled_documents)
 
         for label_string, n_doc in n_doc_distribution.items():
             #docs_index = label2id[numpy.where(label2id['key'] == label_string.encode('utf-8'))][0]['value']
@@ -121,6 +94,7 @@ def count_document_distribution(self, labeled_documents:Dict[str,List[Any]], lab
         return numpy.array(n_doc_distribution_list, dtype='i8')
 
     def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1, joblib_backend='auto'):
+        # type: (AvailableInputTypes, int, int, str) -> DataCsrMatrix
         """This function makes TERM-frequency matrix for TF-IDF calculation.
         TERM-frequency matrix is scipy.csr_matrix.
         """
@@ -154,52 +128,21 @@ def labeledMultiDocs2TermFreqMatrix(self, labeled_documents, ngram=1, n_jobs=1,
             set_document_information.feature2id,
             n_docs_distribution,
             term_frequency_distribution)
 
-
     def labeledMultiDocs2DocFreqMatrix(self,
-                                       labeled_documents:Dict[str,List[Any]],
+                                       labeled_documents:AvailableInputTypes,
                                        ngram:int=1,
                                        n_jobs:int=1,
                                        joblib_backend:str='auto')->DataCsrMatrix:
-        """This function makes document-frequency matrix for PMI calculation.
-        Document-frequency matrix is scipy.csr_matrix.
-
-        labeled_structure must be following key-value pair
-
-        >>> {
-            "label_a": [
-                ["I", "aa", "aa", "aa", "aa", "aa"],
-                ["bb", "aa", "aa", "aa", "aa", "aa"],
-                ["I", "aa", "hero", "some", "ok", "aa"]
-            ],
-            "label_b": [
-                ["bb", "bb", "bb"],
-                ["bb", "bb", "bb"],
-                ["hero", "ok", "bb"],
-                ["hero", "cc", "bb"],
-            ],
-            "label_c": [
-                ["cc", "cc", "cc"],
-                ["cc", "cc", "bb"],
-                ["xx", "xx", "cc"],
-                ["aa", "xx", "cc"],
-            ]
-        }
-
-        There is 3 Output data.
-
-        vocaburary is, dict object with token: feature_id
-        >>> {'I_aa_hero': 4, 'xx_xx_cc': 1, 'I_aa_aa': 2, 'bb_aa_aa': 3, 'cc_cc_bb': 8}
-
-        label_group_dict is, dict object with label_name: label_id
-        >>> {'label_b': 0, 'label_c': 1, 'label_a': 2}
-
-        csr_matrix is, sparse matrix from scipy.sparse
-
-
-        :param dict labeled_structure: above data structure
-        :param int ngram: you can get score with ngram-words
-        :return: `(csr_matrix: scipy.csr_matrix, label_group_dict: dict, vocabulary: dict)`
-        :rtype: tuple
+        """This function makes a document-frequency matrix. The document-frequency matrix is scipy.csr_matrix.
+
+        * Input object
+            - "labeled_documents" is either a dict object or a persisted dict such as PersistentDict or SqliteDict. The expected format is below
+            >>> {"label_a": [["I", "aa", "aa", "aa", "aa", "aa"],["bb", "aa", "aa", "aa", "aa", "aa"],["I", "aa", "hero", "some", "ok", "aa"]],
+            >>> "label_b": [["bb", "bb", "bb"],["bb", "bb", "bb"],["hero", "ok", "bb"],["hero", "cc", "bb"],],
+            >>> "label_c": [["cc", "cc", "cc"],["cc", "cc", "bb"],["xx", "xx", "cc"],["aa", "xx", "cc"],]}
+
+        * Output
+            - DataCsrMatrix object.
         """
 
         self.__check_data_structure(labeled_documents)
@@ -247,7 +190,7 @@ def __conv_into_dict_format(word_score_items):
     return out_format_structure
 
 
-def ScoreMatrix2ScoreDictionary(scored_matrix:csr_matrix,
+def scored_matrix2score_dictionary(scored_matrix:csr_matrix,
                                 label2id_dict:Dict[str,int],
                                 feature2id_dict:Dict[FeatureType,int],
                                 outformat:str='items',
@@ -259,20 +202,11 @@ def ScoreMatrix2ScoreDictionary(scored_matrix:csr_matrix,
 
     If outformat='dict', you get
 
-    >>> {label_name:
-        {
-            feature: score
-        }
-    }
+    >>> {label_name:{feature: score}}
 
     Else if outformat='items', you get
 
-    >>> [
-        {
-            feature: score
-        }
-    ]
-
+    >>> [{feature: score}]
     """
 
     scored_objects = utils.get_feature_dictionary(
@@ -292,4 +226,7 @@ def ScoreMatrix2ScoreDictionary(scored_matrix:csr_matrix,
     else:
         raise ValueError('outformat must be either of {dict, items}')
 
-    return out_format_structure
\ No newline at end of file
+    return out_format_structure
+
+# alias kept for backward compatibility with code written for older versions
+ScoreMatrix2ScoreDictionary = scored_matrix2score_dictionary
\ No newline at end of file
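Note: `scored_matrix2score_dictionary` above is the PEP8-style rename of the old `ScoreMatrix2ScoreDictionary`, and the trailing alias keeps old callers working. A minimal sketch of the call; the `scored_matrix`, `label2id` and `feature2id` variables are hypothetical placeholders for values produced elsewhere in the pipeline:

```python
from DocumentFeatureSelection.common import data_converter

# both names resolve to the same function after this change
assert data_converter.ScoreMatrix2ScoreDictionary is data_converter.scored_matrix2score_dictionary

result = data_converter.scored_matrix2score_dictionary(
    scored_matrix=scored_matrix,    # hypothetical scipy.sparse csr_matrix of scores
    label2id_dict=label2id,         # hypothetical mapping, e.g. {'label_a': 0, ...}
    feature2id_dict=feature2id,     # hypothetical mapping, e.g. {'aa': 0, ...}
    outformat='items')              # 'items' -> [{feature: score}], 'dict' -> {label_name: {feature: score}}
```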
diff --git a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py
index f490eaf..1c7e257 100644
--- a/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py
+++ b/DocumentFeatureSelection/common/labeledMultiDocs2labeledDocsSet.py
@@ -1,10 +1,9 @@
-from collections import namedtuple
 from collections import Counter
-from DocumentFeatureSelection.common import utils
-from DocumentFeatureSelection.models import SetDocumentInformation
+from DocumentFeatureSelection.models import SetDocumentInformation, AvailableInputTypes
 from DocumentFeatureSelection import init_logger
 from sklearn.feature_extraction import DictVectorizer
 from typing import Dict, List, Tuple, Any, Union
+from sqlitedict import SqliteDict
 import logging
 import joblib
 import itertools
@@ -30,13 +29,10 @@ def generate_document_dict(document_key:str,
     return (document_key, document_frequencies)
 
 
-def multiDocs2TermFreqInfo(labeled_documents):
+def multiDocs2TermFreqInfo(labeled_documents:AvailableInputTypes):
     """This function generates information to construct term-frequency matrix
-
-    :param labeled_structure:
-    :return:
     """
-    assert isinstance(labeled_documents, dict)
+    assert isinstance(labeled_documents, (SqliteDict, dict))
 
     counted_frequency = [(label, Counter(list(itertools.chain.from_iterable(documents))))
                          for label, documents in labeled_documents.items()]
@@ -61,15 +57,16 @@ def judge_feature_type(docs:List[List[Union[str, Tuple[Any]]]])->str:
             elif isinstance(feature, tuple):
                 type_flag = 'tuple'
             else:
+                logger.error(msg=docs)
                 raise TypeError('Feature object should be either of str or tuple')
     return type_flag
 
 
-def multiDocs2DocFreqInfo(labeled_documents:Dict[str, List[List[Union[str, Tuple[Any]]]]],
+def multiDocs2DocFreqInfo(labeled_documents:AvailableInputTypes,
                           n_jobs:int=1)->SetDocumentInformation:
     """This function generates information for constructing document-frequency matrix.
     """
-    assert isinstance(labeled_documents, dict)
+    assert isinstance(labeled_documents, (SqliteDict, dict))
 
     type_flag = set([judge_feature_type(docs) for docs in labeled_documents.values()])
     assert len(type_flag)==1
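Note: `judge_feature_type` above requires the tokens of every document to be uniformly `str` or uniformly `tuple`, and `multiDocs2DocFreqInfo` asserts that all labels agree on a single type. A small sketch of the two accepted input shapes; the tuple features are only an illustration, e.g. word plus POS tag:

```python
# plain word features: every token is a str
str_input = {"label_a": [["I", "aa", "aa"], ["bb", "aa"]]}

# structured features: every token is a tuple, e.g. (word, POS)
tuple_input = {"label_a": [[("I", "PRON"), ("aa", "NOUN")]]}

# mixing str and tuple tokens in one dataset raises TypeError in judge_feature_type
```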
diff --git a/DocumentFeatureSelection/interface.py b/DocumentFeatureSelection/interface.py
index 0673960..e10cb81 100644
--- a/DocumentFeatureSelection/interface.py
+++ b/DocumentFeatureSelection/interface.py
@@ -1,14 +1,14 @@
-from DocumentFeatureSelection.models import DataCsrMatrix, ScoredResultObject
+from DocumentFeatureSelection.models import DataCsrMatrix, ScoredResultObject, AvailableInputTypes
 from DocumentFeatureSelection.common import data_converter
 from DocumentFeatureSelection.soa.soa_python3 import SOA
 from DocumentFeatureSelection.pmi.PMI_python3 import PMI
 from DocumentFeatureSelection.tf_idf.tf_idf import TFIDF
 from DocumentFeatureSelection.bns.bns_python3 import BNS
 from DocumentFeatureSelection import init_logger
-from typing import List, Dict, Any, Union, Tuple
+from sqlitedict import SqliteDict
+from typing import Dict
 from scipy.sparse.csr import csr_matrix
 import logging
-import numpy
 logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
 METHOD_NAMES = ['soa', 'pmi', 'tf_idf', 'bns']
 N_FEATURE_SWITCH_STRATEGY = 1000000
@@ -21,7 +21,7 @@ def decide_joblib_strategy(feature2id_dict:Dict[str,int])->str:
         return 'multiprocessing'
 
 
-def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]],
+def run_feature_selection(input_dict:AvailableInputTypes,
                           method:str,
                           ngram:int=1,
                           n_jobs:int=1,
@@ -32,8 +32,9 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
         raise Exception('method name must be either of {}. Yours: {}'.format(METHOD_NAMES, method))
 
     if method == 'tf_idf':
-        # getting term-frequency matrix.
-        # ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
+        """Compute the scored matrix from a term-frequency matrix.
+        ATTENTION: the input for TF-IDF MUST be a term-frequency matrix, NOT a document-frequency matrix.
+        """
         matrix_data_object = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix(
             labeled_documents=input_dict,
             ngram=ngram,
@@ -46,6 +47,8 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
         assert isinstance(scored_sparse_matrix, csr_matrix)
 
     elif method in ['soa', 'pmi'] and matrix_form is None:
+        """Compute the scored matrix with either SOA or PMI.
+        """
         matrix_data_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
             labeled_documents=input_dict,
             ngram=ngram,
@@ -69,10 +72,13 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
                 joblib_backend=backend_strategy,
                 use_cython=use_cython)
             assert isinstance(scored_sparse_matrix, csr_matrix)
+        else:
+            raise Exception()
 
     elif method == 'soa' and matrix_form == 'term_freq':
-        # getting term-frequency matrix.
-        # ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix
+        """Compute the SOA score matrix from a term-frequency matrix.
+        ATTENTION: the input here MUST be a term-frequency matrix, NOT a document-frequency matrix.
+        """
         matrix_data_object = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix(
             labeled_documents=input_dict,
             ngram=ngram,
@@ -89,6 +95,9 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
         assert isinstance(scored_sparse_matrix, csr_matrix)
 
     elif method == 'bns':
+        """Compute the scored matrix with BNS.
+        ATTENTION: the number of labels must always be 2.
+        """
        if not 'positive' in input_dict:
            raise KeyError('input_dict must have "positive" key')
        if not 'negative' in input_dict:
@@ -113,7 +122,6 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]]
             joblib_backend=backend_strategy
         )
         assert isinstance(scored_sparse_matrix, csr_matrix)
-
     else:
         raise Exception()
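Note: as the `KeyError` checks above show, `method='bns'` is strictly two-class: the input dict must carry exactly the keys "positive" and "negative". A minimal sketch:

```python
from DocumentFeatureSelection import interface

input_dict = {
    "positive": [["I", "aa", "hero"], ["bb", "aa"]],
    "negative": [["bb", "bb"], ["hero", "cc", "bb"]],
}
# any other key layout raises KeyError for method='bns'
bns_scored_object = interface.run_feature_selection(input_dict=input_dict, method='bns', n_jobs=1)
```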
+ """ matrix_data_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix( labeled_documents=input_dict, ngram=ngram, @@ -69,10 +72,13 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] joblib_backend=backend_strategy, use_cython=use_cython) assert isinstance(scored_sparse_matrix, csr_matrix) + else: + raise Exception() elif method == 'soa' and matrix_form == 'term_freq': - # getting term-frequency matrix. - # ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix + """You get score-matrix with soa from term-frequency matrix. + ATTENTION: the input for TF-IDF MUST be term-frequency matrix. NOT document-frequency matrix + """ matrix_data_object = data_converter.DataConverter().labeledMultiDocs2TermFreqMatrix( labeled_documents=input_dict, ngram=ngram, @@ -89,6 +95,9 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] assert isinstance(scored_sparse_matrix, csr_matrix) elif method == 'bns': + """You get scored-matrix with bns. + ATTENTION: #label should be 2 always. + """ if not 'positive' in input_dict: raise KeyError('input_dict must have "positive" key') if not 'negative' in input_dict: @@ -113,7 +122,6 @@ def run_feature_selection(input_dict:Dict[str,List[List[Union[str,Tuple[Any]]]]] joblib_backend=backend_strategy ) assert isinstance(scored_sparse_matrix, csr_matrix) - else: raise Exception() diff --git a/DocumentFeatureSelection/models.py b/DocumentFeatureSelection/models.py index c6f7b06..ca213b8 100644 --- a/DocumentFeatureSelection/models.py +++ b/DocumentFeatureSelection/models.py @@ -2,7 +2,82 @@ from scipy.sparse.csr import csr_matrix from DocumentFeatureSelection.common import utils from numpy.core.multiarray import array, ndarray -FeatureType = TypeVar('T', str, Tuple[Any]) +from sqlitedict import SqliteDict +import pickle, json, csv, os, shutil + +# this class is from https://code.activestate.com/recipes/576642/ +class PersistentDict(dict): + ''' Persistent dictionary with an API compatible with shelve and anydbm. + + The dict is kept in memory, so the dictionary operations run as fast as + a regular dictionary. + + Write to disk is delayed until close or sync (similar to gdbm's fast mode). + + Input file format is automatically discovered. + Output file format is selectable between pickle, json, and csv. + All three serialization formats are backed by fast C implementations. 
diff --git a/README.md b/README.md
index c957f55..5b43271 100644
--- a/README.md
+++ b/README.md
@@ -168,7 +168,13 @@ Removed a bug when calling n_gram method of DataConverter
 
 * Resolved bottleneck poins in pre-processing
     * Introduced dict-vectorising in ScikitLearn
-    * Introduced Cython in calculating PMI \& SOA. You can call them with `use_cython=True` flag. See `examples/example_python3.py`
+    * Introduced Cython in calculating PMI \& SOA. You can call them with `use_cython=True` flag. See `examples/basic_example.py`
 * Performance
     * Cython PMI takes 11.87 sec.
-    * Python multiprocessing PMI takes 513.541 sec. (8.55 min.)
\ No newline at end of file
+    * Python multiprocessing PMI takes 513.541 sec. (8.55 min.)
+
+## 1.3.2 2016/11/29
+
+* You can now put a persisted dict object on a disk drive instead of a dict object in memory.
+    * This makes it possible to use a huge dict object as the data source of `interface.run_feature_selection()`
+    * See the example in `examples/huge_data_example.py`
\ No newline at end of file
diff --git a/examples/example_python3.py b/examples/advanced_example.py
similarity index 56%
rename from examples/example_python3.py
rename to examples/advanced_example.py
index 7cde041..48781b3 100644
--- a/examples/example_python3.py
+++ b/examples/advanced_example.py
@@ -7,105 +7,6 @@
 logger = logging.getLogger('sample usage')
 logger.level = logging.ERROR
 
-
-# ======================================================================================================
-# basic usage
-
-input_dict = {
-    "label_a": [
-        ["I", "aa", "aa", "aa", "aa", "aa"],
-        ["bb", "aa", "aa", "aa", "aa", "aa"],
-        ["I", "aa", "hero", "some", "ok", "aa"]
-    ],
-    "label_b": [
-        ["bb", "bb", "bb"],
-        ["bb", "bb", "bb"],
-        ["hero", "ok", "bb"],
-        ["hero", "cc", "bb"],
-    ],
-    "label_c": [
-        ["cc", "cc", "cc"],
-        ["cc", "cc", "bb"],
-        ["xx", "xx", "cc"],
-        ["aa", "xx", "cc"],
-    ]
-}
-
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# tf idf
-
-tf_idf_scored_object = interface.run_feature_selection(
-    input_dict=input_dict,
-    method='tf_idf',
-    ngram=1,
-    n_jobs=5
-)
-
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# pmi
-pmi_scored_object = interface.run_feature_selection(
-    input_dict=input_dict,
-    method='pmi',
-    ngram=1,
-    n_jobs=1,
-    use_cython=False
-)
-pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary())
-
-# you can use cython version pmi also
-# !Warning! The output value with "use_cython=True" is veeeery little different such as the 10th decimal place.
-pmi_scored_object_cython = interface.run_feature_selection(
-    input_dict=input_dict,
-    method='pmi',
-    ngram=1,
-    n_jobs=1,
-    use_cython=True
-)
-pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary())
-
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# soa
-soa_scored_object = interface.run_feature_selection(
-    input_dict=input_dict,
-    method='soa',
-    ngram=1,
-    n_jobs=5
-)
-pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary())
-
-soa_scored_object_cython = interface.run_feature_selection(
-    input_dict=input_dict,
-    method='soa',
-    ngram=1,
-    n_jobs=1,
-    use_cython=True
-)
-pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary())
-
-
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# bns
-input_dict = {
-    "positive": [
-        ["I", "aa", "aa", "aa", "aa", "aa"],
-        ["bb", "aa", "aa", "aa", "aa", "aa"],
-        ["I", "aa", "hero", "some", "ok", "aa"]
-    ],
-    "negative": [
-        ["bb", "bb", "bb"],
-        ["bb", "bb", "bb"],
-        ["hero", "ok", "bb"],
-        ["hero", "cc", "bb"],
-    ]
-}
-bns_scored_object = interface.run_feature_selection(
-    input_dict=input_dict,
-    method='bns',
-    n_jobs=1
-)
-pprint.pprint(bns_scored_object.ScoreMatrix2ScoreDictionary())
-
-
 # ======================================================================================================
 # expert usage
 # you can put complex-structure-feature as feature.
diff --git a/examples/basic_example.py b/examples/basic_example.py
new file mode 100644
index 0000000..08c964d
--- /dev/null
+++ b/examples/basic_example.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+__author__ = 'kensuke-mi'
+
+from DocumentFeatureSelection import interface
+import logging
+import pprint
+logger = logging.getLogger('sample usage')
+logger.level = logging.ERROR
+
+
+# ======================================================================================================
+# basic usage
+
+input_dict = {
+    "label_a": [
+        ["I", "aa", "aa", "aa", "aa", "aa"],
+        ["bb", "aa", "aa", "aa", "aa", "aa"],
+        ["I", "aa", "hero", "some", "ok", "aa"]
+    ],
+    "label_b": [
+        ["bb", "bb", "bb"],
+        ["bb", "bb", "bb"],
+        ["hero", "ok", "bb"],
+        ["hero", "cc", "bb"],
+    ],
+    "label_c": [
+        ["cc", "cc", "cc"],
+        ["cc", "cc", "bb"],
+        ["xx", "xx", "cc"],
+        ["aa", "xx", "cc"],
+    ]
+}
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# tf idf
+
+tf_idf_scored_object = interface.run_feature_selection(
+    input_dict=input_dict,
+    method='tf_idf',
+    ngram=1,
+    n_jobs=5
+)
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# pmi
+pmi_scored_object = interface.run_feature_selection(
+    input_dict=input_dict,
+    method='pmi',
+    ngram=1,
+    n_jobs=1,
+    use_cython=False
+)
+pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary())
+
+# you can also use the cython version of pmi
+# !Warning! With "use_cython=True" the output can differ very slightly, e.g. around the 10th decimal place.
+pmi_scored_object_cython = interface.run_feature_selection(
+    input_dict=input_dict,
+    method='pmi',
+    ngram=1,
+    n_jobs=1,
+    use_cython=True
+)
+pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary())
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# soa
+soa_scored_object = interface.run_feature_selection(
+    input_dict=input_dict,
+    method='soa',
+    ngram=1,
+    n_jobs=5
+)
+pprint.pprint(soa_scored_object.ScoreMatrix2ScoreDictionary())
+
+soa_scored_object_cython = interface.run_feature_selection(
+    input_dict=input_dict,
+    method='soa',
+    ngram=1,
+    n_jobs=1,
+    use_cython=True
+)
+pprint.pprint(soa_scored_object_cython.ScoreMatrix2ScoreDictionary())
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# bns
+input_dict = {
+    "positive": [
+        ["I", "aa", "aa", "aa", "aa", "aa"],
+        ["bb", "aa", "aa", "aa", "aa", "aa"],
+        ["I", "aa", "hero", "some", "ok", "aa"]
+    ],
+    "negative": [
+        ["bb", "bb", "bb"],
+        ["bb", "bb", "bb"],
+        ["hero", "ok", "bb"],
+        ["hero", "cc", "bb"],
+    ]
+}
+bns_scored_object = interface.run_feature_selection(
+    input_dict=input_dict,
+    method='bns',
+    n_jobs=1
+)
+pprint.pprint(bns_scored_object.ScoreMatrix2ScoreDictionary())
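Note: `ScoreMatrix2ScoreDictionary()` in these examples returns the 'items' form, a flat list of per-(label, feature) records, which is why the results are pprinted directly. Assuming the method mirrors the module-level `scored_matrix2score_dictionary`, passing `outformat='dict'` would return the nested form instead:

```python
# sketch of the two output shapes (keys shown are illustrative):
#   outformat='items' -> a list with one record per (label, feature) pair
#   outformat='dict'  -> {label_name: {feature: score}}
pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary(outformat='dict'))
```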
diff --git a/examples/huge_data_example.py b/examples/huge_data_example.py
new file mode 100644
index 0000000..19c7925
--- /dev/null
+++ b/examples/huge_data_example.py
@@ -0,0 +1,61 @@
+from nltk.corpus import gutenberg
+from nltk.corpus import webtext
+from nltk.corpus import genesis
+from nltk.corpus import abc
+from DocumentFeatureSelection import interface
+from DocumentFeatureSelection.models import PersistentDict
+from sqlitedict import SqliteDict
+import time
+import os
+
+"""This example shows how to work on a huge dataset.
+As the persisted dict object you can choose either PersistentDict or SqliteDict.
+You are supposed to have the following nltk corpora available:
+- abc
+- genesis
+- webtext
+- gutenberg
+"""
+
+#----------------------------------------------------------
+abc_corpus = abc.sents()
+genesis_corpus = genesis.sents()
+web_corpus = webtext.sents()
+gutenberg_corpus = gutenberg.sents()
+
+# Case of PersistentDict
+persistent_dict_obj = PersistentDict('demo.json', 'c', format='json')
+persistent_dict_obj['abc'] = list(abc_corpus)
+persistent_dict_obj['genesis'] = list(genesis_corpus)
+persistent_dict_obj['web'] = list(web_corpus)
+persistent_dict_obj['gutenberg'] = list(gutenberg_corpus)
+persistent_dict_obj.sync()    # write the dict to demo.json
+
+start = time.time()
+scored_matrix_obj = interface.run_feature_selection(
+        input_dict=persistent_dict_obj,
+        method='pmi',
+        use_cython=True
+    )
+elapsed_time = time.time() - start
+print("elapsed_time with PersistentDict: {} [sec]".format(elapsed_time))
+
+# Case of SqliteDict
+persistent_sqlite3_dict_obj = SqliteDict('./my_db.sqlite', autocommit=True)
+persistent_sqlite3_dict_obj['abc'] = list(abc_corpus)
+persistent_sqlite3_dict_obj['genesis'] = list(genesis_corpus)
+persistent_sqlite3_dict_obj['web'] = list(web_corpus)
+persistent_sqlite3_dict_obj['gutenberg'] = list(gutenberg_corpus)
+
+start = time.time()
+scored_matrix_obj_ = interface.run_feature_selection(
+        input_dict=persistent_sqlite3_dict_obj,
+        method='pmi',
+        use_cython=True
+    )
+elapsed_time = time.time() - start
+print("elapsed_time with SqliteDict: {} [sec]".format(elapsed_time))
+
+# clean up the persisted files
+os.remove('./my_db.sqlite')
+os.remove('demo.json')
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 4a91f12..04ffe69 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 """
 __author__ = 'kensuke-mi'
-__version__ = '1.3.1'
+__version__ = '1.3.2'
 
 import sys
 import pip
@@ -60,7 +60,7 @@
 
 if python_version >= (3, 0, 0):
     install_requires = ['six', 'setuptools>=1.0', 'joblib', 'numpy',
-                        'scipy', 'nltk', 'scikit-learn', 'pypandoc', 'cython']
+                        'scipy', 'nltk', 'scikit-learn', 'pypandoc', 'cython', 'sqlitedict']
 else:
     raise Exception('This package does NOT support Python2.x')
diff --git a/tests/test_interface.py b/tests/test_interface.py
new file mode 100644
index 0000000..bd95c7d
--- /dev/null
+++ b/tests/test_interface.py
@@ -0,0 +1,76 @@
+import unittest
+from DocumentFeatureSelection import interface
+from DocumentFeatureSelection.models import ScoredResultObject
+from DocumentFeatureSelection.models import PersistentDict
+from sqlitedict import SqliteDict
+import os
+import numpy
+
+class TestInterface(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.input_dict = {
+            "label_a": [
+                ["I", "aa", "aa", "aa", "aa", "aa"],
+                ["bb", "aa", "aa", "aa", "aa", "aa"],
+                ["I", "aa", "hero", "some", "ok", "aa"]
+            ],
+            "label_b": [
+                ["bb", "bb", "bb"],
+                ["bb", "bb", "bb"],
+                ["hero", "ok", "bb"],
+                ["hero", "cc", "bb"],
+            ],
+            "label_c": [
+                ["cc", "cc", "cc"],
+                ["cc", "cc", "bb"],
+                ["xx", "xx", "cc"],
+                ["aa", "xx", "cc"],
+            ]
+        }
+        cls.method = ['pmi', 'tf_idf', 'soa']
+        cls.bool_cython = [False, True]
+        cls.joblib_range = range(0, 2)
+        cls.path_shelve_file = './shelve'
+        cls.path_sqlite3_persistent = './temp_db.sqlite3'
+
+    @classmethod
+    def tearDownClass(cls):
+        os.remove(cls.path_sqlite3_persistent)
+        if os.path.exists(cls.path_shelve_file):
+            os.remove(cls.path_shelve_file)
+
+    def test_interface_shelve(self):
+        # flag and format must be passed by keyword; the third positional argument is mode
+        shelve_obj = PersistentDict(self.path_shelve_file, flag='c', format='json')
+        for key, value in self.input_dict.items(): shelve_obj[key] = value
+
+        sqlite3_dict_obj = SqliteDict(filename=self.path_sqlite3_persistent, autocommit=True)
+        for key, value in self.input_dict.items(): sqlite3_dict_obj[key] = value
+
+        for method_name in self.method:
+            for cython_flag in self.bool_cython:
+                scored_result_persisted = interface.run_feature_selection(
+                    input_dict=shelve_obj,
+                    method=method_name, use_cython=cython_flag)  # type: ScoredResultObject
+                self.assertIsInstance(scored_result_persisted, ScoredResultObject)
+                self.assertIsInstance(scored_result_persisted.ScoreMatrix2ScoreDictionary(), list)
+
+                scored_result_sqlite3_persisted = interface.run_feature_selection(
+                    input_dict=sqlite3_dict_obj,
+                    method=method_name, use_cython=cython_flag)  # type: ScoredResultObject
+                self.assertIsInstance(scored_result_sqlite3_persisted, ScoredResultObject)
+                self.assertIsInstance(scored_result_sqlite3_persisted.ScoreMatrix2ScoreDictionary(), list)
+
+                # check that the result is the same whether the data source is a persisted dict or a plain dict
+                scored_result_dict = interface.run_feature_selection(
+                    input_dict=self.input_dict,
+                    method=method_name, use_cython=cython_flag)  # type: ScoredResultObject
+                self.assertIsInstance(scored_result_dict, ScoredResultObject)
+                self.assertIsInstance(scored_result_dict.ScoreMatrix2ScoreDictionary(), list)
+
+                numpy.testing.assert_array_equal(scored_result_persisted.scored_matrix.toarray(), scored_result_dict.scored_matrix.toarray())
+                numpy.testing.assert_array_equal(scored_result_sqlite3_persisted.scored_matrix.toarray(), scored_result_dict.scored_matrix.toarray())
+
+if __name__ == '__main__':
+    unittest.main()
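Note: the new test module can be driven with the standard unittest runner; a minimal sketch, assuming it is executed from the repository root:

```python
import unittest

# discover and run tests/test_interface.py
suite = unittest.defaultTestLoader.discover('tests', pattern='test_interface.py')
unittest.TextTestRunner(verbosity=2).run(suite)
```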