Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support codings with "implicit intercept" #107

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions patsy/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from patsy.util import (atleast_2d_column_default,
have_pandas, asarray_or_pandas,
safe_issubdtype)
from patsy.desc import INTERCEPT
from patsy.design_info import (DesignMatrix, DesignInfo,
FactorInfo, SubtermInfo)
from patsy.redundancy import pick_contrasts_for_term
Expand Down Expand Up @@ -570,7 +571,8 @@ def next(self):

def _make_subterm_infos(terms,
num_column_counts,
cat_levels_contrasts):
cat_levels_contrasts,
implicit_intercept):
# Sort each term into a bucket based on the set of numeric factors it
# contains:
term_buckets = OrderedDict()
Expand Down Expand Up @@ -601,9 +603,14 @@ def _make_subterm_infos(terms,
used_subterms = set()
for term in bucket_terms:
subterm_infos = []
# If this bucket is empty, the unique noncategorical subterm is the
# intercept, so it should be skipped in pick_contrasts_for_term
# if implicit_intercept is True.
skip_noncategorical_subterm = implicit_intercept and not bucket
factor_codings = pick_contrasts_for_term(term,
num_column_counts,
used_subterms)
used_subterms,
skip_noncategorical_subterm)
# Construct one SubtermInfo for each subterm
for factor_coding in factor_codings:
subterm_factors = []
Expand Down Expand Up @@ -636,7 +643,7 @@ def _make_subterm_infos(terms,
return term_to_subterm_infos

def design_matrix_builders(termlists, data_iter_maker, eval_env,
NA_action="drop"):
NA_action="drop", implicit_intercept=False):
"""Construct several :class:`DesignInfo` objects from termlists.

This is one of Patsy's fundamental functions. This function and
Expand All @@ -661,6 +668,10 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
:arg NA_action: An :class:`NAAction` object or string, used to determine
what values count as 'missing' for purposes of determining the levels of
categorical factors.
:arg implicit_intercept: A boolean value, default `False`. When set to
`True`, the design matrix excludes the intercept, but its subterms are
chosen as if the intercept were included, even if it reduces the rank
of the resulting matrix.
:returns: A list of :class:`DesignInfo` objects, one for each
termlist passed in.

Expand All @@ -683,6 +694,10 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
if isinstance(NA_action, str):
NA_action = NAAction(NA_action)
all_factors = set()
for termlist in termlists:
if implicit_intercept and INTERCEPT in termlist:
raise PatsyError("An intercept should not be included explicitly"
"if implicit_intercept is set to True")
for termlist in termlists:
for term in termlist:
all_factors.update(term.factors)
Expand Down Expand Up @@ -718,7 +733,8 @@ def design_matrix_builders(termlists, data_iter_maker, eval_env,
for termlist in termlists:
term_to_subterm_infos = _make_subterm_infos(termlist,
num_column_counts,
cat_levels_contrasts)
cat_levels_contrasts,
implicit_intercept)
assert isinstance(term_to_subterm_infos, OrderedDict)
assert frozenset(term_to_subterm_infos) == frozenset(termlist)
this_design_factor_infos = {}
Expand Down
36 changes: 24 additions & 12 deletions patsy/highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
# data source. If formula_like is not capable of doing this, then returns
# None.
def _try_incr_builders(formula_like, data_iter_maker, eval_env,
NA_action):
NA_action, implicit_intercept):
if isinstance(formula_like, DesignInfo):
return (design_matrix_builders([[]], data_iter_maker, eval_env, NA_action)[0],
formula_like)
Expand Down Expand Up @@ -67,11 +67,13 @@ def _try_incr_builders(formula_like, data_iter_maker, eval_env,
formula_like.rhs_termlist],
data_iter_maker,
eval_env,
NA_action)
NA_action,
implicit_intercept)
else:
return None

def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop",
implicit_intercept=False):
"""Construct a design matrix builder incrementally from a large data set.

:arg formula_like: Similar to :func:`dmatrix`, except that explicit
Expand All @@ -94,6 +96,11 @@ def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
:arg NA_action: An :class:`NAAction` object or string, used to determine
what values count as 'missing' for purposes of determining the levels of
categorical factors.
:arg implicit_intercept: Boolean value, default `False`. When set to
`True`, no intercept is included in the :class:`DesignInfo` that is
returned, but other columns are chosen as if an intercept were included,
even if it reduces the rank of design matrices built from the
:class:`DesignInfo`.
:returns: A :class:`DesignInfo`

Tip: for `data_iter_maker`, write a generator like::
Expand All @@ -109,7 +116,7 @@ def iter_maker():
"""
eval_env = EvalEnvironment.capture(eval_env, reference=1)
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
NA_action)
NA_action, implicit_intercept)
if design_infos is None:
raise PatsyError("bad formula-like object")
if len(design_infos[0].column_names) > 0:
Expand All @@ -118,7 +125,7 @@ def iter_maker():
return design_infos[1]

def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
NA_action="drop"):
NA_action="drop", implicit_intercept=False):
"""Construct two design matrix builders incrementally from a large data
set.

Expand All @@ -127,7 +134,7 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
"""
eval_env = EvalEnvironment.capture(eval_env, reference=1)
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
NA_action)
NA_action, implicit_intercept)
if design_infos is None:
raise PatsyError("bad formula-like object")
if len(design_infos[0].column_names) == 0:
Expand All @@ -152,7 +159,7 @@ def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
# (DesignInfo, DesignInfo)
# any object with a special method __patsy_get_model_desc__
def _do_highlevel_design(formula_like, data, eval_env,
NA_action, return_type):
NA_action, return_type, implicit_intercept):
if return_type == "dataframe" and not have_pandas:
raise PatsyError("pandas.DataFrame was requested, but pandas "
"is not installed")
Expand All @@ -162,7 +169,7 @@ def _do_highlevel_design(formula_like, data, eval_env,
def data_iter_maker():
return iter([data])
design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env,
NA_action)
NA_action, implicit_intercept)
if design_infos is not None:
return build_design_matrices(design_infos, data,
NA_action=NA_action,
Expand Down Expand Up @@ -223,7 +230,7 @@ def _regularize_matrix(m, default_column_prefix):
return (lhs, rhs)

def dmatrix(formula_like, data={}, eval_env=0,
NA_action="drop", return_type="matrix"):
NA_action="drop", return_type="matrix", implicit_intercept=False):
"""Construct a single design matrix given a formula_like and data.

:arg formula_like: An object that can be used to construct a design
Expand All @@ -243,6 +250,10 @@ def dmatrix(formula_like, data={}, eval_env=0,
:class:`NAAction` object. See :class:`NAAction` for details on what
values count as 'missing' (and how to alter this).
:arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
:arg implicit_intercept: Boolean value, default `False`. When set to
`True`, no intercept is included in the design matrix, but other
columns are chosen as if an intercept were included, even if it
reduces the rank of the matrix.

The `formula_like` can take a variety of forms. You can use any of the
following:
Expand Down Expand Up @@ -288,14 +299,15 @@ def dmatrix(formula_like, data={}, eval_env=0,
"""
eval_env = EvalEnvironment.capture(eval_env, reference=1)
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
NA_action, return_type)
NA_action, return_type,
implicit_intercept)
if lhs.shape[1] != 0:
raise PatsyError("encountered outcome variables for a model "
"that does not expect them")
return rhs

def dmatrices(formula_like, data={}, eval_env=0,
NA_action="drop", return_type="matrix"):
NA_action="drop", return_type="matrix", implicit_intercept=False):
"""Construct two design matrices given a formula_like and data.

This function is identical to :func:`dmatrix`, except that it requires
Expand All @@ -307,7 +319,7 @@ def dmatrices(formula_like, data={}, eval_env=0,
"""
eval_env = EvalEnvironment.capture(eval_env, reference=1)
(lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env,
NA_action, return_type)
NA_action, return_type, implicit_intercept)
if lhs.shape[1] == 0:
raise PatsyError("model is missing required outcome variables")
return (lhs, rhs)
36 changes: 27 additions & 9 deletions patsy/redundancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,22 +211,29 @@ def t(given, expected):
# of each factor to this set in place. So the first time this routine is
# called, pass in an empty set, and then just keep passing the same set to
# any future calls.
# 'skip_noncategorical_subterm' is a boolean value indicating whether subterms
# should be chosen so that the unique subterm with no categorical factors is
# not in their span, even if the resulting matrix is not full rank. Setting
# skip_noncategorical_subterm to True is equivalent to including this subterm
# in used_subterms.
# Returns: a list of dicts. Each dict maps from factors to booleans. The
# coding for the given term should use a full-rank contrast for those factors
# which map to True, a (n-1)-rank contrast for those factors which map to
# False, and any factors which are not mentioned are numeric and should be
# added back in. These dicts should add columns to the design matrix from left
# to right.
def pick_contrasts_for_term(term, numeric_factors, used_subterms):
def pick_contrasts_for_term(term, numeric_factors, used_subterms,
skip_noncategorical_subterm):
categorical_factors = [f for f in term.factors if f not in numeric_factors]
# Converts a term into an expanded list of subterms like:
# a:b -> 1 + a- + b- + a-:b-
# and discards the ones that have already been used.
subterms = []
for subset in _subsets_sorted(categorical_factors):
subterm = _Subterm([_ExpandedFactor(False, f) for f in subset])
if subterm not in used_subterms:
subterms.append(subterm)
if subset or not skip_noncategorical_subterm:
subterm = _Subterm([_ExpandedFactor(False, f) for f in subset])
if subterm not in used_subterms:
subterms.append(subterm)
used_subterms.update(subterms)
_simplify_subterms(subterms)
factor_codings = []
Expand All @@ -239,17 +246,28 @@ def pick_contrasts_for_term(term, numeric_factors, used_subterms):

def test_pick_contrasts_for_term():
from patsy.desc import Term
# Test with skip_noncategorical_subterm=False
used = set()
codings = pick_contrasts_for_term(Term([]), set(), used)
codings = pick_contrasts_for_term(Term([]), set(), used, False)
assert codings == [{}]
codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used)
codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used, False)
assert codings == [{"a": False}]
codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used)
codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used, False)
assert codings == [{"a": True, "b": False}]
used_snapshot = set(used)
codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used)
codings = pick_contrasts_for_term(Term(["c", "d"]), set(), used, False)
assert codings == [{"d": False}, {"c": False, "d": True}]
# Do it again backwards, to make sure we're deterministic with respect to
# order:
codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot)
codings = pick_contrasts_for_term(Term(["d", "c"]), set(), used_snapshot,
False)
assert codings == [{"c": False}, {"c": True, "d": False}]

# Test with skip_noncategorical_subterm=True
used = set()
codings = pick_contrasts_for_term(Term(["a"]), set(), used, True)
assert codings == [{"a": False}]
codings = pick_contrasts_for_term(Term(["a", "x"]), set(["x"]), used, True)
assert codings == []
codings = pick_contrasts_for_term(Term(["a", "b"]), set(), used, True)
assert codings == [{"b": False}]
43 changes: 43 additions & 0 deletions patsy/test_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,49 @@ def iter_maker():
build_design_matrices, [builder], data,
return_type="asdfsadf")

def test_implicit_intercept():
data = balanced(c=2, d=2)
data["x"] = [1, 2, 3, 4]
def iter_maker():
yield data

# Test that when there is only a numeric term, it does not get excluded
builder = design_matrix_builders([make_termlist("x")], iter_maker, 0,
implicit_intercept=True)[0]
mat = build_design_matrices([builder],
{"x": [10.0, 15.0, 20.0]})[0]
assert mat.shape == (3, 1)
assert np.array_equal(mat, [[10], [15], [20]])

# Test that when there are a numeric term and a categorical term, one of
# the categorical subterms is omitted
builder = design_matrix_builders([make_termlist("x", "c")], iter_maker, 0,
implicit_intercept=True)[0]
mat = build_design_matrices([builder],
{"x": [10.0, 15.0, 20.0],
"c": np.asarray(["c1", "c2", "c1"],
dtype=object)})[0]
assert mat.shape == (3, 2)
assert np.array_equal(mat, [[0, 10], [1, 15], [0, 20]])

# Test that when there is a product of numeric and categorical factors,
# all subterms are included
builder = design_matrix_builders([make_termlist("xc")], iter_maker, 0,
implicit_intercept=True)[0]
mat = build_design_matrices([builder],
{"x": [10.0, 15.0, 20.0],
"c": np.asarray(["c1", "c2", "c1"],
dtype=object)})[0]
assert mat.shape == (3, 2)
assert np.array_equal(mat, [[10, 0], [0, 15], [20, 0]])

# Test that when there are a numeric term and a product of categorical
# factors, one of the categorical subterms is omitted
builder = design_matrix_builders([make_termlist("x", "cd")], iter_maker, 0,
implicit_intercept=True)[0]
mat = build_design_matrices([builder], data)[0]
assert mat.shape[1] == 4 # 3 categorical subterms, one numeric

def test_NA_action():
initial_data = {"x": [1, 2, 3], "c": ["c1", "c2", "c1"]}
def iter_maker():
Expand Down
58 changes: 58 additions & 0 deletions patsy/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,64 @@ def raise_patsy_error(x):
else:
assert False

def test_dmatrix_implicit_intercept():
data = balanced(a=2, b=2)
data["x"] = [1, 2, 3, 4]
data["y"] = [2, 3, 4, 5]

# Test that in a formula with one simple categorical term, one of
# the term's levels is excluded
mat = dmatrix("0 + a", data=data, implicit_intercept=True)
assert np.allclose(mat, [[0],
[0],
[1],
[1]])

# In a more complicated formula with a categorical term,
# the encoding still excludes one categorical level
mat = dmatrix("0 + a:b + x", data=data, return_type="dataframe",
implicit_intercept=True)
assert mat.shape[1] == 4
assert np.allclose(mat[['b[T.b2]', 'a[T.a2]:b[b1]', 'a[T.a2]:b[b2]', 'x']],
[[0, 0, 0, 1],
[1, 0, 0, 2],
[0, 1, 0, 3],
[1, 0, 1, 4]])

# A term with numerical and categorical factors should get a full-rank
# coding
mat = dmatrix("0 + a:x", data=data, return_type="dataframe",
implicit_intercept=True)
assert mat.shape[1] == 2
assert np.allclose(mat[['a[a1]:x', 'a[a2]:x']],
[[1, 0],
[2, 0],
[0, 3],
[0, 4]])

# When using implicit_intercept, users should pass a formula that
# explicitly specifies no intercept (e.g. "0 + a" instead of "a").
# This tests that dmatrix raises an error if a user fails to do this.
assert_raises(PatsyError, dmatrix, "a", data=data, implicit_intercept=True)

# Test that in a formula with one simple categorical term, one of
# the term's levels is excluded
lmat, rmat = dmatrices("y ~ 0 + a", data=data, implicit_intercept=True)
assert np.allclose(lmat, [[2],
[3],
[4],
[5]])
assert np.allclose(rmat, [[0],
[0],
[1],
[1]])

# When using implicit_intercept, users should pass a formula that
# explicitly specifies no intercept (e.g. "0 + a" instead of "a").
# This tests that dmatrices raises an error if a user fails to do this.
assert_raises(PatsyError, dmatrices, "y ~ 1 + a", data=data,
implicit_intercept=True)

def test_dmatrix_NA_action():
data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]}

Expand Down