Simple_Linear_Regression_using_Scikit_Learn_and_Pyspark.py
# coding: utf-8
# # Simple Linear Regression using Scikit Learn
# ### Simple Linear Regression Equation
#
#
# ### y = a0 + a1*X
# In[44]:
#Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# In[45]:
#Read Student Grades .csv file and divide the data into dependent and independent variables.
data = pd.read_csv('Student_Grades_Data.csv')
data.head()
# In[46]:
data.shape
# In[47]:
X = data.iloc[:, :-1].values   #independent variable: time to study (all columns except the last)
y = data.iloc[:, 1].values     #dependent variable: grades (second column)
# In[48]:
X
# In[49]:
y
# In[50]:
#Split the data into training and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
# In[51]:
y_test
# In[52]:
#Fit the Simple Linear Regression Model
from sklearn.linear_model import LinearRegression
LinReg = LinearRegression()
LinReg.fit(X_train, y_train)
# In[53]:
#Print the intercept (a0) and coefficient (a1) of the fitted model
print(f'a0 = {LinReg.intercept_}')
print(f'a1 = {LinReg.coef_}')
# In[54]:
#Predicted grade scores from test dataset
y_predict = LinReg.predict(X_test)
y_predict
# In[55]:
#Actual grade scores from test dataset
y_test
# In[56]:
#Grades Vs Time to Study visualization on Training Data
plt.scatter(X_train, y_train, color='Blue')
plt.plot(X_train, LinReg.predict(X_train), color='Black')
plt.title('Grades Vs Time to Study (On Training Data)')
plt.xlabel('Time to Study')
plt.ylabel('Grades')
plt.show()
# In[57]:
#Grades Vs Time to Study visualization on Test Data
plt.scatter(X_test, y_test, color='Red')
plt.plot(X_train, LinReg.predict(X_train), color='Black')
plt.title('Grades Vs Time to Study (On Test Data)')
plt.xlabel('Time to Study')
plt.ylabel('Grades')
plt.show()
# In[66]:
#Predicting the Grade of a student who studied for 10 Hrs. Example of how to pass an external value,
#independent of the Test or Training Dataset
Predict_Grade = LinReg.predict([[10]])
Predict_Grade
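# In[ ]:
#A minimal sketch (not in the original notebook): verify the prediction above against the fitted
#equation y = a0 + a1*X, using the intercept and coefficient printed earlier.
manual_grade = LinReg.intercept_ + LinReg.coef_[0] * 10   #a0 + a1 * 10 hours of study
manual_grade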
# In[59]:
#Model Evaluation using R-Square
from sklearn import metrics
r_square = metrics.r2_score(y_test, y_predict)
print('R-Square:', r_square)
# In[60]:
#For Illustration Purposes Only.
#Considering a Multiple Linear Regression equation with two variables: grade = a0 + a1*time_to_study + a2*class_participation
#Model Evaluation using Adjusted R-Square.
# Here n = no. of observations and p = no. of independent variables
n = 50
p = 2
Adj_r_square = 1-(1-r_square)*(n-1)/(n-p-1)
print('Adjusted R-Square:', Adj_r_square)
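# In[ ]:
#A minimal sketch (not in the original notebook): for the single-feature model actually fitted above,
#n and p can be taken from the test data itself instead of the hardcoded illustration values.
n_actual = len(y_test)        #number of observations in the test set
p_actual = X_test.shape[1]    #number of independent variables (1 here: time to study)
Adj_r_square_actual = 1 - (1 - r_square) * (n_actual - 1) / (n_actual - p_actual - 1)
print('Adjusted R-Square (from test data):', Adj_r_square_actual)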
# In[61]:
#Model Evaluation using Mean Square Error (MSE)
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_predict))
# In[62]:
#Model Evaluation using Root Mean Square Error (RMSE)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_predict)))
# In[63]:
#Model Evaluation using Mean Absolute Error (MAE)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_predict))
# In[ ]:
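# In[ ]:
#The filename mentions PySpark alongside scikit-learn; the cell below is a minimal, untested sketch of
#the same simple linear regression using pyspark.ml. The column names 'Time_to_Study' and 'Grades' are
#assumptions about the CSV schema and may need to be adjusted.
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLinearRegression

spark = SparkSession.builder.appName('SimpleLinearRegression').getOrCreate()

#Read the same CSV into a Spark DataFrame (header row and schema inference enabled)
sdf = spark.read.csv('Student_Grades_Data.csv', header=True, inferSchema=True)

#Assemble the single independent variable into the 'features' vector column expected by pyspark.ml
assembler = VectorAssembler(inputCols=['Time_to_Study'], outputCol='features')
sdf = assembler.transform(sdf)

#Split, fit, and inspect the fitted parameters (a0 = intercept, a1 = coefficient)
train_df, test_df = sdf.randomSplit([0.7, 0.3], seed=0)
spark_lr = SparkLinearRegression(featuresCol='features', labelCol='Grades')
spark_model = spark_lr.fit(train_df)
print('a0 =', spark_model.intercept)
print('a1 =', spark_model.coefficients)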