Merge branch 'release/1.0.1' into stable
dan-blanchard committed Feb 20, 2015
2 parents d116ccd + 22b62c9 commit 77e91a4
Showing 9 changed files with 213 additions and 47 deletions.
97 changes: 80 additions & 17 deletions README.rst
@@ -1,52 +1,114 @@
SciKit-Learn Laboratory
-----------------------

.. image:: https://travis-ci.org/EducationalTestingService/skll.svg?branch=stable
.. image:: https://img.shields.io/travis/EducationalTestingService/skll/stable.svg
:alt: Build status
:target: https://travis-ci.org/EducationalTestingService/skll

.. image:: http://img.shields.io/coveralls/EducationalTestingService/skll/stable.svg
.. image:: https://img.shields.io/coveralls/EducationalTestingService/skll/stable.svg
:target: https://coveralls.io/r/EducationalTestingService/skll

.. image:: http://img.shields.io/pypi/dm/skll.svg
.. image:: https://img.shields.io/pypi/dm/skll.svg
:target: https://warehouse.python.org/project/skll/
:alt: PyPI downloads

.. image:: http://img.shields.io/pypi/v/skll.svg
.. image:: https://img.shields.io/pypi/v/skll.svg
:target: https://warehouse.python.org/project/skll/
:alt: Latest version on PyPI

.. image:: http://img.shields.io/pypi/l/skll.svg
.. image:: https://img.shields.io/pypi/l/skll.svg
:alt: License

.. image:: http://img.shields.io/badge/DOI-10.5281%2Fzenodo.12825-blue.svg
.. image:: https://img.shields.io/badge/DOI-10.5281%2Fzenodo.12825-blue.svg
:target: http://dx.doi.org/10.5281/zenodo.12825
:alt: DOI for citing SKLL 1.0.0

This Python package provides utilities to make it easier to run
machine learning experiments with scikit-learn.
This Python package provides command-line utilities to make it easier to run
machine learning experiments with scikit-learn. One of the primary goals of
our project is to make it so that you can run scikit-learn experiments without
actually needing to write any code other than what you used to generate/extract
the features.

Command-line Interface
~~~~~~~~~~~~~~~~~~~~~~

``run_experiment`` is a command-line utility for running a series of learners on
datasets specified in a configuration file. For more information about using
run_experiment (including a quick example), go
`here <https://skll.readthedocs.org/en/latest/run_experiment.html>`__.
The main utility we provide is called ``run_experiment`` and it can be used to
easily run a series of learners on datasets specified in a configuration file
like:

.. code:: ini

    [General]
    experiment_name = Titanic_Evaluate_Tuned
    # valid tasks: cross_validate, evaluate, predict, train
    task = evaluate

    [Input]
    # these directories could also be absolute paths
    # (and must be if you're not running things in local mode)
    train_directory = train
    test_directory = dev
    # Can specify multiple sets of feature files that are merged together automatically
    # (even across formats)
    featuresets = [["family.ndj", "misc.csv", "socioeconomic.arff", "vitals.csv"]]
    # List of scikit-learn learners to use
    learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"]
    # Column in CSV containing labels to predict
    label_col = Survived
    # Column in CSV containing instance IDs (if any)
    id_col = PassengerId

    [Tuning]
    # Should we tune parameters of all learners by searching provided parameter grids?
    grid_search = true
    # Function to maximize when performing grid search
    objective = accuracy

    [Output]
    # again, these can/should be absolute paths
    log = output
    results = output
    predictions = output
    models = output

For more information about getting started with ``run_experiment``, please check
out `our tutorial <https://skll.readthedocs.org/en/latest/tutorial.html>`__, or
`our config file specs <https://skll.readthedocs.org/en/latest/run_experiment.html>`__.

We also provide utilities for:

- `converting between machine learning toolkit formats <https://skll.readthedocs.org/en/latest/utilities.html#skll-convert>`__
(e.g., ARFF, CSV, MegaM)
- `filtering feature files <https://skll.readthedocs.org/en/latest/utilities.html#filter-features>`__
- `joining feature files <https://skll.readthedocs.org/en/latest/utilities.html#join-features>`__
- `other common tasks <https://skll.readthedocs.org/en/latest/utilities.html>`__


Python API
~~~~~~~~~~

If you just want to avoid writing a lot of boilerplate learning code, you can
use our simple Python API. The main way you'll want to use the API is through
the ``Learner`` and ``Reader`` classes. For more details on how to simply
train, test, cross-validate, and run grid search on a variety of scikit-learn
models see
`the documentation <https://skll.readthedocs.org/en/latest/index.html>`__.
also use our simple Python API. The main way you'll want to use the API is through
the ``Learner`` and ``Reader`` classes. For more details on our API, see
`the documentation <https://skll.readthedocs.org/en/latest/api.html>`__.

While our API can be broadly useful, it should be noted that the command-line
utilities are intended as the primary way of using SKLL. The API is just a nice
side-effect of our developing the utilities.


A Note on Pronunciation
~~~~~~~~~~~~~~~~~~~~~~~

.. image:: doc/skll.png
:alt: SKLL logo
:align: right

.. container:: clear

.. image:: doc/spacer.png

SciKit-Learn Laboratory (SKLL) is pronounced "skull": that's where the learning
happens.

@@ -70,6 +132,7 @@ Requirements
Talks
~~~~~

- *Simpler Machine Learning with SKLL 1.0*, Dan Blanchard, PyData NYC 2014 (`video <https://www.youtube.com/watch?v=VEo2shBuOrc&feature=youtu.be&t=1s>`__ | `slides <http://www.slideshare.net/DanielBlanchard2/py-data-nyc-2014>`__)
- *Simpler Machine Learning with SKLL*, Dan Blanchard, PyData NYC 2013 (`video <http://vimeo.com/79511496>`__ | `slides <http://www.slideshare.net/DanielBlanchard2/simple-machine-learning-with-skll>`__)

Books
2 changes: 1 addition & 1 deletion conda.yaml
@@ -1,6 +1,6 @@
package:
name: skll
version: {{ environ['GIT_DESCRIBE_TAG'].replace('v', '') }}
version: {{ environ.get('GIT_DESCRIBE_TAG', '').replace('v', '') }}

source:
git_url: ./
71 changes: 47 additions & 24 deletions doc/run_experiment.rst
@@ -145,8 +145,8 @@ possible settings for each section is provided below, but to summarize:
.. _evaluate:

* If you want to **train a model and evaluate it** on some data, specify a
training location, a test location, and a directory to store to store
results, and set :ref:`task` to ``evaluate``.
training location, a test location, and a directory to store results,
and set :ref:`task` to ``evaluate``.

.. _predict:

@@ -388,10 +388,13 @@ Any labels not included in the dictionary will be left untouched.
cv_folds_file *(Optional)*
""""""""""""""""""""""""""""""

Path to a csv file (with a header that is ignored) specifying folds for cross-
validation. The first column should consist of training set IDs and the second
should be a string for the fold ID (e.g., 1 through 5, A through D, etc.). If
specified, the CV and grid search will leave one fold ID out at a time. [#]_
Path to a csv file specifying folds for cross-validation. The first row is
treated as a header and ignored, so its contents do not matter, but it must be
present; if it is missing, the first data row will be skipped in its place. The
first column should consist of training set IDs and the second should be a
string for the fold ID (e.g., 1 through 5, A through D, etc.). If specified,
the CV and grid search will leave one fold ID out at a time. [#]_
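As a sketch of the expected layout (the file contents and fold labels here are made up for illustration, not taken from SKLL), the folds file can be parsed with the standard ``csv`` module, discarding the header row:

```python
import csv
import io

# Hypothetical contents of a cv_folds_file: a throwaway header row,
# then one (instance ID, fold ID) pair per line.
folds_csv = io.StringIO(
    "id,fold\n"
    "EXAMPLE_0,A\n"
    "EXAMPLE_1,A\n"
    "EXAMPLE_2,B\n"
)

reader = csv.reader(folds_csv)
next(reader)  # the first row is always discarded as a header
fold_for_id = {row[0]: row[1] for row in reader}
print(fold_for_id)  # {'EXAMPLE_0': 'A', 'EXAMPLE_1': 'A', 'EXAMPLE_2': 'B'}
```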

.. _custom_learner_path:

@@ -677,24 +680,45 @@ Defaults to ``f1_score_micro``.
param_grids *(Optional)*
""""""""""""""""""""""""

List of parameter grids to search for each classifier. Each parameter
List of parameter grids to search for each learner. Each parameter
grid should be a list of dictionaries mapping from strings to lists
of parameter values. When you specify an empty list for a classifier,
the default parameter grid for that classifier will be searched.
of parameter values. When you specify an empty list for a learner,
the default parameter grid for that learner will be searched.
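For example (illustrative values only), a configuration's ``[Tuning]`` section could supply one grid per entry in ``learners``, with an empty list falling back to that learner's default grid:

```ini
[Tuning]
grid_search = true
# one grid per learner in the learners list; the empty list means
# "search the default grid for that learner"
param_grids = [[{"C": [0.01, 0.1, 1.0, 10.0]}], []]
```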

The default parameter grids for each classifier are:
The default parameter grids for each learner are:

LogisticRegression
AdaBoostClassifier and AdaBoostRegressor
.. code-block:: python
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}]
[{'learning_rate': [0.01, 0.1, 1.0, 10.0, 100.0]}]
DecisionTreeClassifier and DecisionTreeRegressor
.. code-block:: python
[{'max_features': ["auto", None]}]
ElasticNet, Lasso, and Ridge
.. code-block:: python
[{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}]
GradientBoostingClassifier and GradientBoostingRegressor
.. code-block:: python
[{'max_depth': [1, 3, 5]}]
KNeighborsClassifier and KNeighborsRegressor
.. code-block:: python
[{'n_neighbors': [1, 5, 10, 100],
'weights': ['uniform', 'distance']}]
LinearSVC
.. code-block:: python
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}]
SVC
LogisticRegression
.. code-block:: python
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}]
@@ -704,25 +728,22 @@ MultinomialNB
[{'alpha': [0.1, 0.25, 0.5, 0.75, 1.0]}]
DecisionTreeClassifier and DecisionTreeRegressor
.. code-block:: python
[{'max_features': ["auto", None]}]
RandomForestClassifier and RandomForestRegressor
.. code-block:: python
[{'max_depth': [1, 5, 10, None]}]
GradientBoostingClassifier and GradientBoostingRegressor
SGDClassifier and SGDRegressor
.. code-block:: python
[{'max_depth': [1, 3, 5], 'n_estimators': [500]}]
[{'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01],
'penalty': ['l1', 'l2', 'elasticnet']}]
ElasticNet, Lasso, and Ridge
SVC
.. code-block:: python
[{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}]
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0],
'gamma': [0.01, 0.1, 1.0, 10.0, 100.0]}]
SVR
.. code-block:: python
@@ -808,8 +829,10 @@ specified via command-line arguments instead of in the configuration file:
specified number of feature files in each featureset in the
configuration file held out. For example, if you have three feature
files (``A``, ``B``, and ``C``) in your featureset and you specify
``--ablation 1``, there will be three three experiments conducted with
the following featuresets: ``[[A, B], [B, C], [A, C]]``.
``--ablation 1``, there will be three experiments conducted with
the following featuresets: ``[[A, B], [B, C], [A, C]]``. Additionally,
since every ablation experiment includes a run with all the features as a
baseline, the following featureset will also be run: ``[[A, B, C]]``.
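The featuresets described above can be reproduced with ``itertools.combinations`` (a sketch of the behavior, not SKLL's actual implementation):

```python
from itertools import combinations

def ablation_featuresets(features, ablation=1):
    """All featuresets with `ablation` files left out, plus the
    full set as a baseline (mirroring the behavior described above)."""
    keep = len(features) - ablation
    featuresets = [list(combo) for combo in combinations(features, keep)]
    featuresets.append(list(features))  # baseline run with all features
    return featuresets

print(ablation_featuresets(["A", "B", "C"]))
# [['A', 'B'], ['A', 'C'], ['B', 'C'], ['A', 'B', 'C']]
```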

If you would like to try all possible combinations of feature files, you
can use the :option:`run_experiment --ablation_all` option instead.
2 changes: 1 addition & 1 deletion skll/data/featureset.py
@@ -1,6 +1,6 @@
# License: BSD 3 clause
"""
labels related to storing/merging feature sets.
Classes related to storing/merging feature sets.
:author: Dan Blanchard ([email protected])
:organization: ETS
18 changes: 17 additions & 1 deletion skll/experiments.py
@@ -16,6 +16,7 @@
import json
import logging
import math
import numpy as np
import os
import sys
from collections import defaultdict
@@ -51,6 +52,21 @@
'AdditiveChi2Sampler', ''])


class NumpyTypeEncoder(json.JSONEncoder):
'''
This class is used when serializing results, particularly the input label
values if the input has int-valued labels. Numpy int64 objects can't
be serialized by the json module, so we must convert them to int objects.
A related issue where this was adapted from:
http://stackoverflow.com/questions/11561932/why-does-json-dumpslistnp-arange5-fail-while-json-dumpsnp-arange5-tolis
'''
def default(self, obj):
if isinstance(obj, np.int64):
return int(obj)
return json.JSONEncoder.default(self, obj)


def _get_stat_float(label_result_dict, stat):
"""
Little helper for getting output for precision, recall, and f-score
@@ -824,7 +840,7 @@ def _classify_featureset(args):
# write out the result dictionary to a json file
file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
with open(results_json_path, file_mode) as json_file:
json.dump(res, json_file)
json.dump(res, json_file, cls=NumpyTypeEncoder)

with open(join(results_path,
'{}.results'.format(job_name)),
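The encoder pattern used by ``NumpyTypeEncoder`` can be demonstrated without numpy installed by duck-typing on the ``item()`` method that numpy scalars expose; the ``ScalarEncoder`` and ``FakeInt64`` names below are stand-ins for illustration, not part of SKLL:

```python
import json

class ScalarEncoder(json.JSONEncoder):
    # Variation on the NumpyTypeEncoder idea: numpy scalars expose
    # .item(), which returns a plain Python value, so we can duck-type
    # instead of checking isinstance(obj, np.int64).
    def default(self, obj):
        if hasattr(obj, "item") and callable(obj.item):
            return obj.item()
        return json.JSONEncoder.default(self, obj)

class FakeInt64:
    """Stand-in for np.int64: json.dump rejects it by default."""
    def __init__(self, value):
        self.value = value

    def item(self):
        return int(self.value)

print(json.dumps({"label": FakeInt64(3)}, cls=ScalarEncoder))
# {"label": 3}
```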
2 changes: 1 addition & 1 deletion skll/version.py
@@ -7,5 +7,5 @@
:organization: ETS
"""

__version__ = '1.0.0'
__version__ = '1.0.1'
VERSION = tuple(int(x) for x in __version__.split('.'))
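The ``VERSION`` line above turns the version string into a tuple of ints, which compares correctly where string comparison would not:

```python
# Same expression as in skll/version.py, shown standalone.
__version__ = '1.0.1'
VERSION = tuple(int(x) for x in __version__.split('.'))

print(VERSION)                # (1, 0, 1)
print(VERSION > (1, 0, 0))    # True
print('1.0.10' > '1.0.9')     # False: why string comparison is avoided
```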
11 changes: 11 additions & 0 deletions tests/configs/test_int_labels_cv.template.cfg
@@ -0,0 +1,11 @@
[General]
experiment_name=test_int_labels_cv
task=cross_validate

[Input]
learners=["Ridge"]
suffix=.jsonlines

[Output]

[Tuning]
27 changes: 27 additions & 0 deletions tests/other/test_int_labels_cv.jsonlines
@@ -0,0 +1,27 @@
{"x": {"f1": 2.1, "f2": -3}, "y": 2}
{"x": {"f1": 3.13, "f2": -5}, "y": 3}
{"x": {"f1": 0.94, "f2": -6}, "y": 2}
{"x": {"f1": 4.3, "f2": -3}, "y": 4}
{"x": {"f1": 6.15, "f2": -1}, "y": 5}
{"x": {"f1": 2.26, "f2": -3}, "y": 2}
{"x": {"f1": 3.4, "f2": -5}, "y": 3}
{"x": {"f1": 1.87, "f2": -1}, "y": 2}
{"x": {"f1": 0.14, "f2": -2}, "y": 1}
{"x": {"f1": 2.1, "f2": -3}, "y": 2}
{"x": {"f1": 3.13, "f2": -5}, "y": 3}
{"x": {"f1": 0.93, "f2": -6}, "y": 2}
{"x": {"f1": 4.33, "f2": -3}, "y": 4}
{"x": {"f1": 6.14, "f2": -1}, "y": 5}
{"x": {"f1": 2.2, "f2": -3}, "y": 2}
{"x": {"f1": 3.4, "f2": -5}, "y": 3}
{"x": {"f1": 1.83, "f2": -1}, "y": 2}
{"x": {"f1": 0.12, "f2": -2}, "y": 1}
{"x": {"f1": 2.12, "f2": -3}, "y": 2}
{"x": {"f1": 3.1, "f2": -5}, "y": 3}
{"x": {"f1": 0.91, "f2": -6}, "y": 2}
{"x": {"f1": 4.3, "f2": -3}, "y": 4}
{"x": {"f1": 6.13, "f2": -1}, "y": 5}
{"x": {"f1": 2.22, "f2": -3}, "y": 2}
{"x": {"f1": 3.43, "f2": -5}, "y": 3}
{"x": {"f1": 1.81, "f2": -1}, "y": 2}
{"x": {"f1": 0.11, "f2": -2}, "y": 1}
