pydata · vnavkal · May 15, 2017 · May 15, 2017
diff --git a/patsy/build.py b/patsy/build.py
@@ -18,6 +18,7 @@
 from patsy.util import (atleast_2d_column_default,
                         have_pandas, asarray_or_pandas,
                         safe_issubdtype)
+from patsy.desc import INTERCEPT
 from patsy.design_info import (DesignMatrix, DesignInfo,
                                FactorInfo, SubtermInfo)
 from patsy.redundancy import pick_contrasts_for_term
@@ -570,7 +571,8 @@ def next(self):
 
 def _make_subterm_infos(terms,
                         num_column_counts,
-                        cat_levels_contrasts):
+                        cat_levels_contrasts,
+                        implicit_intercept):
     # Sort each term into a bucket based on the set of numeric factors it
     # contains:
     term_buckets = OrderedDict()
@@ -601,9 +603,14 @@ def _make_subterm_infos(terms,
         used_subterms = set()
         for term in bucket_terms:
             subterm_infos = []
+            # If this bucket is empty, the unique noncategorical subterm is the
+            # intercept, so it should be skipped in pick_contrasts_for_term
+            # if implicit_intercept is True.
+            skip_noncategorical_subterm = implicit_intercept and not bucket
             factor_codings = pick_contrasts_for_term(term,
                                                      num_column_counts,
-                                                     used_subterms)
+                                                     used_subterms,
+                                                     skip_noncategorical_subterm)
             # Construct one SubtermInfo for each subterm
             for factor_coding in factor_codings:
                 subterm_factors = []
@@ -636,7 +643,7 @@ def _make_subterm_infos(terms,
     return term_to_subterm_infos
 
 def design_matrix_builders(termlists, data_iter_maker, eval_env,
-                           NA_action="drop"):
+                           NA_action="drop", implicit_intercept=False):
     """Construct several :class:`DesignInfo` objects from termlists.
 
     This is one of Patsy's fundamental functions. This function and
@@ -661,6 +668,10 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
     :arg NA_action: An :class:`NAAction` object or string, used to determine
       what values count as 'missing' for purposes of determining the levels of
       categorical factors.
+    :arg implicit_intercept: A boolean value, default `False`.  When set to
+      `True`, the design matrix excludes the intercept, but its subterms are
+      chosen as if the intercept were included, even if it reduces the rank
+      of the resulting matrix.
     :returns: A list of :class:`DesignInfo` objects, one for each
       termlist passed in.
 
@@ -683,6 +694,10 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
     if isinstance(NA_action, str):
         NA_action = NAAction(NA_action)
     all_factors = set()
+    for termlist in termlists:
+        if implicit_intercept and INTERCEPT in termlist:
+            raise PatsyError("An intercept should not be included explicitly"
+                             "if implicit_intercept is set to True")
     for termlist in termlists:
         for term in termlist:
             all_factors.update(term.factors)
@@ -718,7 +733,8 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
     for termlist in termlists:
         term_to_subterm_infos = _make_subterm_infos(termlist,
                                                     num_column_counts,
-                                                    cat_levels_contrasts)
+                                                    cat_levels_contrasts,
+                                                    implicit_intercept)
         assert isinstance(term_to_subterm_infos, OrderedDict)
         assert frozenset(term_to_subterm_infos) == frozenset(termlist)
         this_design_factor_infos = {}

diff --git a/patsy/highlevel.py b/patsy/highlevel.py
@@ -31,7 +31,7 @@
 # data source. If formula_like is not capable of doing this, then returns
 # None.
 def _try_incr_builders(formula_like, data_iter_maker, eval_env,
-                       NA_action):
+                       NA_action, implicit_intercept):
     if isinstance(formula_like, DesignInfo):
         return (design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0],
                 formula_like)
@@ -67,11 +67,13 @@ def _try_incr_builders(formula_like, data_iter_maker, eval_env,
                                        formula_like.rhs_termlist],
                                       data_iter_maker,
                                       eval_env,
-                                      NA_action)
+                                      NA_action,
+                                      implicit_intercept)
     else:
         return None
 
-def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
+def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop",
+                  implicit_intercept=False):
     """Construct a design matrix builder incrementally from a large data set.
 
     :arg formula_like: Similar to :func:`dmatrix`, except that explicit
@@ -94,6 +96,11 @@ def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
     :arg NA_action: An :class:`NAAction` object or string, used to determine
       what values count as 'missing' for purposes of determining the levels of
       categorical factors.
+    :arg implicit_intercept: Boolean value, default `False`.  When set to
+      `True`, no intercept is included in the :class:`DesignInfo` that is
+      returned, but other columns are chosen as if an intercept were included,
+      even if it reduces the rank of design matrices built from the
+      :class:`DesignInfo`.
     :returns: A :class:`DesignInfo`
 
     Tip: for `data_iter_maker`, write a generator like::
@@ -109,7 +116,7 @@ def iter_maker():
     """
     eval_env = EvalEnvironment.capture(eval_env, reference=1)
     design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
-                                      NA_action)
+                                      NA_action, implicit_intercept)
     if design_infos is None:
         raise PatsyError("bad formula-like object")
     if len(design_infos[0].column_names) > 0:
@@ -118,7 +125,7 @@ def iter_maker():
     return design_infos[1]
 
 def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
-                   NA_action="drop"):
+                   NA_action="drop", implicit_intercept=False):
     """Construct two design matrix builders incrementally from a large data
     set.
 
@@ -127,7 +134,7 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
     """
     eval_env = EvalEnvironment.capture(eval_env, reference=1)
     design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
-                                      NA_action)
+                                      NA_action, implicit_intercept)
     if design_infos is None:
         raise PatsyError("bad formula-like object")
     if len(design_infos[0].column_names) == 0:
@@ -152,7 +159,7 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
 #   (DesignInfo, DesignInfo)
 #   any object with a special method __patsy_get_model_desc__
 def _do_highlevel_design(formula_like, data, eval_env,
-                         NA_action, return_type):
+                         NA_action, return_type, implicit_intercept):
     if return_type == "dataframe" and not have_pandas:
         raise PatsyError("pandas.DataFrame was requested, but pandas "
                             "is not installed")
@@ -162,7 +169,7 @@ def _do_highlevel_design(formula_like, data, eval_env,
     def data_iter_maker():
         return iter([data])
     design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
-                                      NA_action)
+                                      NA_action, implicit_intercept)
     if design_infos is not None:
         return build_design_matrices(design_infos, data,
                                      NA_action=NA_action,
@@ -223,7 +230,7 @@ def _regularize_matrix(m, default_column_prefix):
         return (lhs, rhs)
 
 def dmatrix(formula_like, data={}, eval_env=0,
-            NA_action="drop", return_type="matrix"):
+            NA_action="drop", return_type="matrix", implicit_intercept=False):
     """Construct a single design matrix given a formula_like and data.
 
     :arg formula_like: An object that can be used to construct a design
@@ -243,6 +250,10 @@ def dmatrix(formula_like, data={}, eval_env=0,
       :class:`NAAction` object. See :class:`NAAction` for details on what
       values count as 'missing' (and how to alter this).
     :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
+    :arg implicit_intercept: Boolean value, default `False`.  When set to
+      `True`, no intercept is included in the design matrix, but other
+      columns are chosen as if an intercept were included, even if it
+      reduces the rank of the matrix.
 
     The `formula_like` can take a variety of forms. You can use any of the
     following:
@@ -288,14 +299,15 @@ def dmatrix(formula_like, data={}, eval_env=0,
     """
     eval_env = EvalEnvironment.capture(eval_env, reference=1)
     (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
-                                      NA_action, return_type)
+                                      NA_action, return_type,
+                                      implicit_intercept)
     if lhs.shape[1] != 0:
         raise PatsyError("encountered outcome variables for a model "
                             "that does not expect them")
     return rhs
 
 def dmatrices(formula_like, data={}, eval_env=0,
-              NA_action="drop", return_type="matrix"):
+              NA_action="drop", return_type="matrix", implicit_intercept=False):
     """Construct two design matrices given a formula_like and data.
 
     This function is identical to :func:`dmatrix`, except that it requires
@@ -307,7 +319,7 @@ def dmatrices(formula_like, data={}, eval_env=0,
     """
     eval_env = EvalEnvironment.capture(eval_env, reference=1)
     (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
-                                      NA_action, return_type)
+                                      NA_action, return_type, implicit_intercept)
     if lhs.shape[1] == 0:
         raise PatsyError("model is missing required outcome variables")
     return (lhs, rhs)
diff --git a/patsy/redundancy.py b/patsy/redundancy.py
@@ -211,22 +211,29 @@ def t(given, expected):
 #   of each factor to this set in place. So the first time this routine is
 #   called, pass in an empty set, and then just keep passing the same set to
 #   any future calls.
+# 'skip_noncategorical_subterm' is a boolean value indicating whether subterms
+#   should be chosen so that the unique subterm with no categorical factors is
+#   not in their span, even if the resulting matrix is not full rank.  Setting
+#   skip_noncategorical_subterm to True is equivalent to including this subterm
+#   in used_subterms.
 # Returns: a list of dicts. Each dict maps from factors to booleans. The
 # coding for the given term should use a full-rank contrast for those factors
 # which map to True, a (n-1)-rank contrast for those factors which map to
 # False, and any factors which are not mentioned are numeric and should be
 # added back in. These dicts should add columns to the design matrix from left
 # to right.
-def pick_contrasts_for_term(term, numeric_factors, used_subterms):
+def pick_contrasts_for_term(term, numeric_factors, used_subterms,
+                            skip_noncategorical_subterm):
     categorical_factors = [f for f in term.factors if f not in numeric_factors]
     # Converts a term into an expanded list of subterms like:
     #   a:b  ->  1 + a- + b- + a-:b-
     # and discards the ones that have already been used.
     subterms = []
     for subset in _subsets_sorted(categorical_factors):
-        subterm = _Subterm([_ExpandedFactor(False, f) for f in subset])
-        if subterm not in used_subterms:
-            subterms.append(subterm)
+        if subset or not skip_noncategorical_subterm:
+            subterm = _Subterm([_ExpandedFactor(False, f) for f in subset])
+            if subterm not in used_subterms:
+                subterms.append(subterm)
     used_subterms.update(subterms)
     _simplify_subterms(subterms)
     factor_codings = []
@@ -239,17 +246,28 @@ def pick_contrasts_for_term(term, numeric_factors, used_subterms):
 
 def test_pick_contrasts_for_term():
     from patsy.desc import Term
+    # Test with skip_noncategorical_subterm=False
     used = set()
-    codings = pick_contrasts_for_term(Term([]), set(), used)
+    codings = pick_contrasts_for_term(Term([]), set(), used, False)
     assert codings == [{}]
-    codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used)
+    codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used, False)
     assert codings == [{"a": False}]
-    codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used)
+    codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used, False)
     assert codings == [{"a": True, "b": False}]
     used_snapshot = set(used)
-    codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used)
+    codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used, False)
     assert codings == [{"d": False}, {"c": False, "d": True}]
     # Do it again backwards, to make sure we're deterministic with respect to
     # order:
-    codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot)
+    codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot,
+                                      False)
     assert codings == [{"c": False}, {"c": True, "d": False}]
+
+    # Test with skip_noncategorical_subterm=True
+    used = set()
+    codings = pick_contrasts_for_term(Term(["a"]), set(), used, True)
+    assert codings == [{"a": False}]
+    codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used, True)
+    assert codings == []
+    codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used, True)
+    assert codings == [{"b": False}]
diff --git a/patsy/test_build.py b/patsy/test_build.py
@@ -263,6 +263,49 @@ def iter_maker():
                   build_design_matrices, [builder], data,
                   return_type="asdfsadf")
 
+def test_implicit_intercept():
+    data = balanced(c=2, d=2)
+    data["x"] = [1, 2, 3, 4]
+    def iter_maker():
+        yield data
+
+    # Test that when there is only a numeric term, it does not get excluded
+    builder = design_matrix_builders([make_termlist("x")], iter_maker, 0,
+                                     implicit_intercept=True)[0]
+    mat = build_design_matrices([builder],
+                                {"x": [10.0, 15.0, 20.0]})[0]
+    assert mat.shape == (3, 1)
+    assert np.array_equal(mat, [[10], [15], [20]])
+
+    # Test that when there are a numeric term and a categorical term, one of
+    # the categorical subterms is omitted
+    builder = design_matrix_builders([make_termlist("x", "c")], iter_maker, 0,
+                                     implicit_intercept=True)[0]
+    mat = build_design_matrices([builder],
+                                {"x": [10.0, 15.0, 20.0],
+                                 "c": np.asarray(["c1", "c2", "c1"],
+                                                 dtype=object)})[0]
+    assert mat.shape == (3, 2)
+    assert np.array_equal(mat, [[0, 10], [1, 15], [0, 20]])
+
+    # Test that when there is a product of numeric and categorical factors,
+    # all subterms are included
+    builder = design_matrix_builders([make_termlist("xc")], iter_maker, 0,
+                                     implicit_intercept=True)[0]
+    mat = build_design_matrices([builder],
+                                {"x": [10.0, 15.0, 20.0],
+                                 "c": np.asarray(["c1", "c2", "c1"],
+                                                 dtype=object)})[0]
+    assert mat.shape == (3, 2)
+    assert np.array_equal(mat, [[10, 0], [0, 15], [20, 0]])
+
+    # Test that when there are a numeric term and a product of categorical
+    # factors, one of the categorical subterms is omitted
+    builder = design_matrix_builders([make_termlist("x", "cd")], iter_maker, 0,
+                                     implicit_intercept=True)[0]
+    mat = build_design_matrices([builder], data)[0]
+    assert mat.shape[1] == 4 # 3 categorical subterms, one numeric
+
 def test_NA_action():
     initial_data = {"x": [1, 2, 3], "c": ["c1", "c2", "c1"]}
     def iter_maker():

diff --git a/patsy/test_highlevel.py b/patsy/test_highlevel.py
@@ -660,6 +660,64 @@ def raise_patsy_error(x):
     else:
         assert False
 
+def test_dmatrix_implicit_intercept():
+    data = balanced(a=2, b=2)
+    data["x"] = [1, 2, 3, 4]
+    data["y"] = [2, 3, 4, 5]
+
+    # Test that in a formula with one simple categorical term, one of
+    # the term's levels is excluded
+    mat = dmatrix("0 + a", data=data, implicit_intercept=True)
+    assert np.allclose(mat, [[0],
+                             [0],
+                             [1],
+                             [1]])
+
+    # In a more complicated formula with a categorical term,
+    # the encoding still excludes one categorical level
+    mat = dmatrix("0 + a:b + x", data=data, return_type="dataframe",
+                  implicit_intercept=True)
+    assert mat.shape[1] == 4
+    assert np.allclose(mat[['b[T.b2]', 'a[T.a2]:b[b1]', 'a[T.a2]:b[b2]', 'x']],
+                       [[0, 0, 0, 1],
+                        [1, 0, 0, 2],
+                        [0, 1, 0, 3],
+                        [1, 0, 1, 4]])
+
+    # A term with numerical and categorical factors should get a full-rank
+    # coding
+    mat = dmatrix("0 + a:x", data=data, return_type="dataframe",
+                  implicit_intercept=True)
+    assert mat.shape[1] == 2
+    assert np.allclose(mat[['a[a1]:x', 'a[a2]:x']],
+                       [[1, 0],
+                        [2, 0],
+                        [0, 3],
+                        [0, 4]])
+
+    # When using implicit_intercept, users should pass a formula that
+    # explicitly specifies no intercept (e.g. "0 + a" instead of "a").
+    # This tests that dmatrix raises an error if a user fails to do this.
+    assert_raises(PatsyError, dmatrix, "a", data=data, implicit_intercept=True)
+
+    # Test that in a formula with one simple categorical term, one of
+    # the term's levels is excluded
+    lmat, rmat = dmatrices("y ~ 0 + a", data=data, implicit_intercept=True)
+    assert np.allclose(lmat, [[2],
+                              [3],
+                              [4],
+                              [5]])
+    assert np.allclose(rmat, [[0],
+                              [0],
+                              [1],
+                              [1]])
+
+    # When using implicit_intercept, users should pass a formula that
+    # explicitly specifies no intercept (e.g. "0 + a" instead of "a").
+    # This tests that dmatrices raises an error if a user fails to do this.
+    assert_raises(PatsyError, dmatrices, "y ~ 1 + a", data=data,
+                  implicit_intercept=True)
+
 def test_dmatrix_NA_action():
     data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]}