#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 4 16:27:33 2019
Implement a logistic regression classifier from scratch and compare it with scikit-learn.
@author: ryan.nanson
"""
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Create (mostly) separable simulated random data: two Gaussian clusters
np.random.seed(12)
num_observations = 5000
x1 = np.random.multivariate_normal([0, 0], [[1, .75],[.75, 1]], num_observations)
x2 = np.random.multivariate_normal([1, 4], [[1, .75],[.75, 1]], num_observations)
simulated_separableish_features = np.vstack((x1, x2)).astype(np.float32)
simulated_labels = np.hstack((np.zeros(num_observations),
                              np.ones(num_observations)))
# Take a look at the data
plt.figure(figsize=(12, 8))
plt.scatter(simulated_separableish_features[:, 0], simulated_separableish_features[:, 1],
            c=simulated_labels, alpha=.4)
plt.show()
# The inverse link function in logistic regression is the sigmoid
def sigmoid(scores):
    return 1 / (1 + np.exp(-scores))
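
# A quick sanity check (an illustrative addition, not part of the original
# script): the sigmoid squashes any real-valued score into (0, 1), with
# sigmoid(0) == 0.5 and values saturating towards 0 and 1 at the extremes.
assert sigmoid(0) == 0.5
assert 0 < sigmoid(-10) < sigmoid(10) < 1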
# Calculate the log-likelihood
def log_likelihood(features, target, weights):
    scores = np.dot(features, weights)
    ll = np.sum(target * scores - np.log(1 + np.exp(scores)))
    return ll
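
# Numerically safer variant (a sketch, not in the original script):
# np.logaddexp(0, scores) computes log(1 + exp(scores)) without overflowing
# for large positive scores, but is otherwise identical to the version above.
def log_likelihood_stable(features, target, weights):
    scores = np.dot(features, weights)
    return np.sum(target * scores - np.logaddexp(0, scores))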
# Logistic regression function, trained by gradient ascent on the log-likelihood
def logistic_regression(features, target, num_steps, learning_rate, add_intercept=False):
    if add_intercept:
        intercept = np.ones((features.shape[0], 1))
        features = np.hstack((intercept, features))
    weights = np.zeros(features.shape[1])
    for step in range(num_steps):
        scores = np.dot(features, weights)
        predictions = sigmoid(scores)
        # Update weights with the gradient of the log-likelihood, X.T @ (y - p);
        # stepping along the gradient (ascent) increases the log-likelihood
        output_error_signal = target - predictions
        gradient = np.dot(features.T, output_error_signal)
        weights += learning_rate * gradient
        # Print the log-likelihood every 10000 steps to monitor convergence
        if step % 10000 == 0:
            print(log_likelihood(features, target, weights))
    return weights
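
# Illustrative gradient check (not in the original script): the analytic
# gradient X.T @ (y - p) used above should agree with a finite-difference
# estimate of the log-likelihood's slope in any single coordinate i.
# The function name and the step size eps are assumptions for this sketch.
def finite_difference_check(features, target, weights, i=0, eps=1e-6):
    bumped = weights.copy()
    bumped[i] += eps
    numeric = (log_likelihood(features, target, bumped)
               - log_likelihood(features, target, weights)) / eps
    analytic = np.dot(features.T, target - sigmoid(np.dot(features, weights)))[i]
    return numeric, analytic  # the two values should agree closely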
# Train the from-scratch model
weights = logistic_regression(simulated_separableish_features, simulated_labels,
                              num_steps=300000, learning_rate=5e-5, add_intercept=True)
# Compare with sklearn's logistic regression: a very large C effectively
# switches off its default L2 regularisation, so the fits should roughly agree
clf = LogisticRegression(fit_intercept=True, C=1e15)
clf.fit(simulated_separableish_features, simulated_labels)
print(clf.intercept_, clf.coef_)
print(weights)
# Test the accuracy of both versions; rounding the sigmoid output applies
# a 0.5 decision threshold
data_with_intercept = np.hstack((np.ones((simulated_separableish_features.shape[0], 1)),
                                 simulated_separableish_features))
final_scores = np.dot(data_with_intercept, weights)
preds = np.round(sigmoid(final_scores))
print('Accuracy from scratch: {0}'.format((preds == simulated_labels).mean()))
print('Accuracy from scikit-learn: {0}'.format(clf.score(simulated_separableish_features, simulated_labels)))
# Plot predictions - correct in one colour, incorrect in another
plt.figure(figsize=(12, 8))
plt.scatter(simulated_separableish_features[:, 0], simulated_separableish_features[:, 1],
            c=(preds == simulated_labels), alpha=.8, s=50)
plt.show()