# final2.txt

# IMPORT UTILITIES
import statistics
import time
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle

# Upper bound on the number of rows kept for modelling.
featuresize = 735000

X = pd.read_csv(r'D:\Desktop\data.csv', low_memory=False)
X = X.dropna()  # discard every row that has at least one NaN/missing value

# DataFrame.sample(n=...) draws without replacement by default and raises
# ValueError when n exceeds the number of available rows, so cap the request
# at what is left after dropna(). Sampling also returns the rows in random
# order.
sample_size = min(featuresize, len(X))
X = X.sample(n=sample_size)


# NOTE(review): X was already drawn with DataFrame.sample(), which returns the
# rows in random order, so this extra shuffle looks redundant; it is kept so
# the sequence of random draws stays as before.
XShuffle = shuffle(X)

# One-hot encode the categorical (non-numeric) columns.
XShuffle = pd.get_dummies(XShuffle)

# Target vector: the SCORE column as a NumPy array.
y = np.array(XShuffle['SCORE'])
yShape = y.shape

# normalize() expects 2-D input, so the labels are presented as a single row;
# with axis=1 that row is rescaled as a whole, then the original 1-D shape is
# restored.
# NOTE(review): the scaling factor is computed from ALL labels (rows that end
# up in the train and the test split alike) and shrinks every label by the
# norm of the full vector -- confirm this is the intended scaling, since any
# fixed error tolerance applied later depends on it.
normalized_labels = normalize(y.reshape(1, -1), axis=1)
y = normalized_labels.reshape(yShape)

# Feature matrix: every column except SCORE, as a NumPy array.
XShuffle = np.array(XShuffle.drop('SCORE', axis=1))
    
# Hold out 20% of the rows for evaluation; the remaining 80% train the model.
train_features, test_features, train_labels, test_labels = train_test_split(
    XShuffle, y, test_size=0.2
)

# Fit a linear regression on the training split (n_jobs=-1 asks scikit-learn
# to use all available processors).
regressor = LinearRegression(n_jobs=-1)
regressor.fit(train_features, train_labels)

# Wall-clock timestamp taken once fitting has finished. Needs the standard
# `time` module to be imported at the top of the file.
initialTime = time.time()
 

# Score the held-out rows with the fitted model.
predictions = regressor.predict(test_features)

# Correlation-coefficient matrix between true and predicted labels, flattened
# to one dimension.
temp = np.corrcoef(test_labels, predictions).reshape(-1, )

# Root of the mean squared error on the test split.
squared_error_mean = mean_squared_error(test_labels, predictions)
rms = sqrt(squared_error_mean)

# A prediction counts as a hit when it lies within `tolerance` of its label;
# "accuracy" is the fraction of hits over the test split.
tolerance = 0.0015
absolute_error = np.abs(predictions - test_labels)
within_tolerance = absolute_error <= tolerance
accuracy = within_tolerance.mean()

# Report the metrics.
print("Root Mean Square Error: ", round(rms, 8))
print("Tolerance is: ", tolerance)
print('Accuracy:', round(accuracy * 100, 2), '%.')
print("###################################")