Skip to content

Commit

Permalink
we need more data
Browse files Browse the repository at this point in the history
  • Loading branch information
Jaiveer Katariya committed Apr 5, 2021
1 parent 5b74790 commit 5c8fa8f
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 50 deletions.
Binary file added __pycache__/logreg.cpython-37.pyc
Binary file not shown.
Binary file added __pycache__/quantfeatures.cpython-37.pyc
Binary file not shown.
123 changes: 109 additions & 14 deletions backtester.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
# - return performance for that window - % of correct posts, average delta in value IF decisions were made according to the model (so this is going to require actual yfinance data)
# - do that for a bunch of different windows and average out performance

import yfinance as yf
import pandas as pd
from datetime import date, datetime, timedelta
import time
from logreg import trainAndTestFromDataframes


trainingWindow = 90 # let's say it's a 90 day window idk
Expand All @@ -13,13 +18,29 @@
csv = 'data/postsWithDate.csv'
df = pd.read_csv(csv)

# sort by date column
# get first and last dates for loop
lastDate = df['date'].max()
currentStartTrainDate = df['date'].min()
currentEndTrainDate = currentDateStart + timedelta(days=trainingWindow)

currentEndTrainDate = date.fromtimestamp(currentStartTrainDate) + timedelta(days=trainingWindow)
currentStartTestDate = currentEndTrainDate + timedelta(days=1)
currentEndTestDate = currentStartTestDate + timedelta(days=testingWindow) # set it to its correct value now, instead of the placeholder
currentEndTestDate = currentStartTestDate + timedelta(days=testingWindow)

# now make them into timestamps to compare them to what's in the dataframe
currentEndTrainDate = time.mktime(currentEndTrainDate.timetuple())
currentStartTestDate = time.mktime(currentStartTestDate.timetuple())
currentEndTestDate = time.mktime(currentEndTestDate.timetuple())


performanceDicts = []

print(date.fromtimestamp(currentEndTestDate).isoformat())
print(date.fromtimestamp(currentStartTrainDate).isoformat())
print(date.fromtimestamp(lastDate).isoformat())

while currentEndTestDate < lastDate:
# currentStartTrainDate is a numeric timestamp (it comes straight from df['date']),
# so it has to be converted with fromtimestamp; fromisoformat only accepts strings
print(f'Starting train window at {date.fromtimestamp(currentStartTrainDate).isoformat()}')
performanceDict = {'appreciationOfBuys': [], 'appreciationOfHolds': [], 'depreciationOfSells': []}

# now let's filter for the correct dates in the dataframe
# element-wise mask of rows whose timestamp lies strictly inside the training window;
# Series comparisons must be combined with `&` (Python's `and` raises on a Series)
isInTrainWindow = (currentStartTrainDate < df.date) & (df.date < currentEndTrainDate)
Expand All @@ -28,31 +49,105 @@
trainWindow = df[isInTrainWindow]
testWindow = df[isInTestWindow]

# assess models here
# get predictions from training over train window, testing over test window --> maybe add if statements for different models? doing log for now
testPredictions = trainAndTestFromDataframes(trainWindow, testWindow)

# now, for each buy, we want to track how much assets appreciated, and for each sell, we want to track how much assets depreciated


for i in range(len(testPredictions)):
appreciationPeriod = 10 # see how we did after 10 days, since we trained our model to look at changes in price after 10 days
currentRow = testWindow.iloc[i]
tickerName = currentRow['ticker']

ticker = yf.Ticker(tickerName)
maxHist = ticker.history(period='max')

datePosted = date.fromtimestamp(currentRow['date'])
datePostedString = datePosted.isoformat()

# isolate prices on day that post was made
isDatePosted = maxHist.index==datePostedString
histDatePosted = maxHist[isDatePosted]

while histDatePosted.empty:
datePosted = datePosted + timedelta(days=1) # we want to take action as soon as we can on the post -- but if markets closed on day post was made, have to wait
datePostedString = datePosted.isoformat()
isDatePosted = maxHist.index==datePostedString
histDatePosted = maxHist[isDatePosted]

y = trainWindow[:, 7] # signal is 8th column (so index 7)
X = np.trainWindow(data, 7, 1) # X will be everything else
datePostedPrice = histDatePosted['Open'].array[0]

scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# now let's see how it changed:
appreciationDate = datePosted + timedelta(days=appreciationPeriod)
appreciationDateString = appreciationDate.isoformat()

# GET PREDICTIONS HERE
isAppreciationDate = maxHist.index==appreciationDateString
histAppreciationDate = maxHist[isAppreciationDate]

# NOW BASED ON PREDICTIONS GET WHAT ACTUALLY HAPPENED... NEED TICKER FOR THAT THOUGH
while histAppreciationDate.empty:
appreciationDate = appreciationDate + timedelta(days=1) # we want to take action as soon as we can on the post -- but if markets closed on day post was made, have to wait
appreciationDateString = appreciationDate.isoformat()
isAppreciationDate = maxHist.index==appreciationDateString
histAppreciationDate = maxHist[isAppreciationDate]

appreciationDatePrice = histAppreciationDate['Close'].array[0]

priceDifference = appreciationDatePrice - datePostedPrice
percentageChange = priceDifference / datePostedPrice


prediction = testPredictions[i]

# get performance of each prediction and add it to the performance dict
if prediction == 2:
performanceDict['appreciationOfBuys'].append(percentageChange)

# get new dates
dateIsCurrentStartDate = df.date == currentStartTrainDate
elif prediction == 1:
performanceDict['appreciationOfHolds'].append(percentageChange)

else:
performanceDict['depreciationOfSells'].append(percentageChange)

# now average out the values
for key in performanceDict:
    # look up by the loop variable, not the literal string 'key'
    changes = performanceDict[key]
    # divide sum by number of predictions that we assessed in that category;
    # a category with no predictions in this window is recorded as 0.0 so the
    # division cannot fail and the later cross-window sum still works
    performanceDict[key] = sum(changes) / len(changes) if changes else 0.0

print(f'Performance dict was {performanceDict}')
print(f'Adding to list of performance dicts...')
performanceDicts.append(performanceDict)

# get new dates by first dropping all rows that were published on the same date as the row we're assessing
# date.fromtimestamp accepts a single number, so convert the column element-wise
# before comparing each row's calendar date with the current start date
dateIsCurrentStartDate = df['date'].map(date.fromtimestamp) == date.fromtimestamp(currentStartTrainDate)
df = df.drop(df[dateIsCurrentStartDate].index)

currentStartTrainDate = df['date'].min()
currentEndTrainDate = currentDateStart + timedelta(days=trainingWindow)

currentEndTrainDate = date.fromtimestamp(currentStartTrainDate) + timedelta(days=trainingWindow)
currentStartTestDate = currentEndTrainDate + timedelta(days=1)
currentEndTestDate = currentStartTestDate + timedelta(days=testingWindow) # set it to its correct value now, instead of the placeholder
currentEndTestDate = currentStartTestDate + timedelta(days=testingWindow)

# put em back into timestamp form so that we can use them in the filtering for the data
currentEndTrainDate = time.mktime(currentEndTrainDate.timetuple())
currentStartTestDate = time.mktime(currentStartTestDate.timetuple())
currentEndTestDate = time.mktime(currentEndTestDate.timetuple())


keys = performanceDicts[0].keys()
numPerformanceDicts = len(performanceDicts)

finalPerformance = {}

for key in keys:
finalPerformance[key] = 0

for i in range(numPerformanceDicts):
finalPerformance[key] = finalPerformance[key] + performanceDicts[i][key] # get sum of all values of this current key

finalPerformance[key] = finalPerformance[key]/numPerformanceDicts

print(f'Final performance: {finalPerformance}')


49 changes: 22 additions & 27 deletions logreg.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,41 @@
import pandas as pd
import numpy as np
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, confusion_matrix
from quantfeatures import dataToNumpy

csv = 'data/quantfeatures.csv'

if __name__ == '__main__':

# NOW -- HAVE THIS CALL THE FUNCTIONS FROM QUANTFEATURES AND GET X AND Y FROM THERE RATHER THAN REGENERATING IT HERE

# df = pd.read_csv(csv)
# data = df.to_numpy()
# print(f'Dataframe columns are {df.columns}')

# data = np.delete(data, [0, 1, 2], 1) # delete columns for ticker, ID on axis 1

def getModelFromNumpy(X, y):
    """Fit a logistic-regression classifier on X and y and return it.

    The classifier is created with max_iter=1000.
    """
    # fit() hands back the estimator itself, so the fitted model is returned directly
    return LogisticRegression(max_iter=1000).fit(X, y)

# y = data[:, 7] # signal is 8th column (so index 7)
# X = np.delete(data, 7, 1) # X will be everything else

def getModelFromDataframe(df):
    """Return a model fitted on the given dataframe (method to be used in backtester).

    The dataframe is converted to a feature matrix and label vector by
    dataToNumpy, which are then passed to getModelFromNumpy.
    """
    features, labels = dataToNumpy(df)
    model = getModelFromNumpy(features, labels)
    return model

# scaler = preprocessing.StandardScaler().fit(X)
# X_scaled = scaler.transform(X)
def getModelFromCSV(csv='data/postsWithDate.csv'):
    """Read the CSV at ``csv`` and return a model fitted on its contents.

    Parameters:
        csv: path of the CSV file to load; defaults to 'data/postsWithDate.csv'.

    Returns:
        The fitted model produced by getModelFromDataframe.
    """
    df = pd.read_csv(csv)
    # hand the fitted model back to the caller; without the return the
    # function always produced None
    return getModelFromDataframe(df)

# # So columns are ['num_comments', 'score', 'upvote_ratio', 'ups', 'downs', 'is_locked',
# # 'is_self', 'compound_sentiment', 'selftext_length',
# # 'title_length', 'num_emojis', 'num_rockets', 'is_Gain', 'is_DD',
# # 'is_Discussion', 'is_News', 'is_Technical Analysis', 'is_Meme',
# # 'is_YOLO', 'is_Loss', 'is_Daily Discussion', 'is_Chart', 'is_Shitpost']

def trainAndTestFromDataframes(trainDf, testDf):
    """Fit a model on trainDf and return its predictions for testDf.

    NOTE(review): both dataframes go through dataToNumpy separately; if that
    helper scales features per call, train and test may be scaled with
    different statistics -- confirm against quantfeatures.
    """
    fittedModel = getModelFromDataframe(trainDf)
    # only the feature matrix is needed for prediction; the test labels are unused
    testFeatures, _ = dataToNumpy(testDf)
    return fittedModel.predict(testFeatures)

# length = X_scaled.shape[0]
if __name__ == '__main__':

# # normalize the y
# y = y+1
# y=y.astype('int')
X_scaled, y = dataToNumpy('data/postsWithDate.csv')
clf = getModelFromNumpy(X_scaled, y)

length = X_scaled.shape[0]

clf = LogisticRegression(max_iter = 1000)
clf.fit(X_scaled, y)
# MANUAL TUNING DONE HERE
predictions = clf.predict(X_scaled)
predictions_proba = clf.predict_proba(X_scaled)

Expand Down Expand Up @@ -116,4 +112,3 @@
print(f"true two prediction zero average probabilities = {averageTrueTwoPredictionZeroProba / trueTwoPredictionZero}")
print(f"true two prediction two average probabilities = {averageTrueTwoPredictionTwoProba / 80}")

# number of emojis?
14 changes: 5 additions & 9 deletions quantfeatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import emoji
from sklearn import preprocessing


# we probably just care about compound sentiment -- other options are neg, neu, pos
Expand Down Expand Up @@ -55,12 +56,10 @@ def getNumEmojis(row, specificEmoji=""):



def dataToNumpy():
def dataToNumpy(df=pd.read_csv('data/postsWithDate.csv')):
nltk.downloader.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
csv = 'data/postsWithDate.csv'
csvOut = 'data/quantfeatures.csv'
df = pd.read_csv(csv)


# engineer additional features, starting with sentiment and lengths of selftext and title
df['compound_sentiment'] = df.apply(lambda row: getCompoundSentiment(row, sia), axis=1)
Expand All @@ -79,7 +78,7 @@ def dataToNumpy():

df = df.drop(columns=['selftext', 'title', 'is_distinguished', 'link_flair_text'])

data = df.to_numpy()
data = df.to_numpy()
print(f'Dataframe columns are {df.columns}')

data = np.delete(data, [0, 1, 2], 1) # delete columns for ticker, ID on axis 1
Expand All @@ -98,8 +97,6 @@ def dataToNumpy():
# 'is_YOLO', 'is_Loss', 'is_Daily Discussion', 'is_Chart', 'is_Shitpost']


length = X_scaled.shape[0]

# normalize the y
y = y+1
y=y.astype('int')
Expand All @@ -108,5 +105,4 @@ def dataToNumpy():
# calling the log reg file - basically make log reg only take in an X and a Y, and that way we can replicate the "logreg" file in the backtester since all it's doing is fitting a model and not any
# of the preprocessing -- that can all be handled here

return X, y

return X_scaled, y

0 comments on commit 5c8fa8f

Please sign in to comment.