we need more data

jaiveerk · Apr 5, 2021 · 5c8fa8f · 5c8fa8f
1 parent 5b74790
commit 5c8fa8f
Show file tree

Hide file tree

Showing 5 changed files with 136 additions and 50 deletions.
diff --git a/__pycache__/logreg.cpython-37.pyc b/__pycache__/logreg.cpython-37.pyc
diff --git a/__pycache__/quantfeatures.cpython-37.pyc b/__pycache__/quantfeatures.cpython-37.pyc
diff --git a/backtester.py b/backtester.py
@@ -5,6 +5,11 @@
 #   - return performance for that window - % of correct posts, average delta in value IF decisions were made according to the model (so this is going to require actual yfinance data)
 #   - do that for a bunch of different windows and average out performance
 
+import yfinance as yf
+import pandas as pd
+from datetime import date, datetime, timedelta
+import time
+from logreg import trainAndTestFromDataframes
 
 
 trainingWindow = 90 # let's say it's a 90 day window idk
@@ -13,13 +18,29 @@
 csv = 'data/postsWithDate.csv'
 df = pd.read_csv(csv)
 
-# sort by date column
+# get first and last dates for loop
+lastDate = df['date'].max()
 currentStartTrainDate = df['date'].min()
-currentEndTrainDate = currentDateStart + timedelta(days=trainingWindow)
+
+currentEndTrainDate = date.fromtimestamp(currentStartTrainDate) + timedelta(days=trainingWindow)
 currentStartTestDate = currentEndTrainDate + timedelta(days=1)
-currentEndTestDate = currentStartTestDate + timedelta(days=testingWindow) # set it to its correct value now, instead of the placeholder
+currentEndTestDate = currentStartTestDate + timedelta(days=testingWindow)
+
+# now make them into timestamps to compare them to what's in the dataframe
+currentEndTrainDate = time.mktime(currentEndTrainDate.timetuple())
+currentStartTestDate = time.mktime(currentStartTestDate.timetuple())
+currentEndTestDate = time.mktime(currentEndTestDate.timetuple())
+
+
+performanceDicts = []
+
+print(date.fromtimestamp(currentEndTestDate).isoformat())
+print(date.fromtimestamp(currentStartTrainDate).isoformat())
+print(date.fromtimestamp(lastDate).isoformat())
 
 while currentEndTestDate < lastDate:
+    print(f'Starting train window at {date.fromtimestamp(currentStartTrainDate).isoformat()}') # currentStartTrainDate is an epoch timestamp (it is passed to date.fromtimestamp elsewhere in this script), so date.fromisoformat would raise TypeError here
+    performanceDict = {'appreciationOfBuys': [], 'appreciationOfHolds': [], 'depreciationOfSells': []}
 
     # now let's filter for the correct dates in the dataframe
     isInTrainWindow = currentStartTrainDate < df.date and df.date < currentEndTrainDate # replace this with whatever it needs to be
@@ -28,31 +49,105 @@
     trainWindow = df[isInTrainWindow]
     testWindow = df[isInTestWindow]
 
-    # assess models here
+    # get predictions from training over train window, testing over test window --> maybe add if statements for different models? doing log for now
+    testPredictions = trainAndTestFromDataframes(trainWindow, testWindow)
+
+    # now, for each buy, we want to track how much assets appreciated, and for each sell, we want to track how much assets depreciated
+
+
+    for i in range(len(testPredictions)):
+        appreciationPeriod = 10 # see how we did after 10 days, since we trained our model to look at changes in price after 10 days
+        currentRow = testWindow.iloc[i]
+        tickerName = currentRow['ticker']
+
+        ticker = yf.Ticker(tickerName)
+        maxHist = ticker.history(period='max')
+
+        datePosted = date.fromtimestamp(currentRow['date'])
+        datePostedString = datePosted.isoformat()
+
+        # isolate prices on day that post was made
+        isDatePosted = maxHist.index==datePostedString
+        histDatePosted = maxHist[isDatePosted]
+
+        while histDatePosted.empty:
+            datePosted = datePosted + timedelta(days=1) # we want to take action as soon as we can on the post -- but if markets closed on day post was made, have to wait
+            datePostedString = datePosted.isoformat()
+            isDatePosted = maxHist.index==datePostedString
+            histDatePosted = maxHist[isDatePosted]
 
-    y = trainWindow[:, 7] # signal is 8th column (so index 7)
-    X = np.trainWindow(data, 7, 1) # X will be everything else
+        datePostedPrice = histDatePosted['Open'].array[0]
 
-    scaler = preprocessing.StandardScaler().fit(X)
-    X_scaled = scaler.transform(X)
+        # now let's see how it changed:
+        appreciationDate = datePosted + timedelta(days=appreciationPeriod)
+        appreciationDateString = appreciationDate.isoformat()
 
-    # GET PREDICTIONS HERE
+        isAppreciationDate = maxHist.index==appreciationDateString
+        histAppreciationDate = maxHist[isAppreciationDate]
 
-    # NOW BASED ON PREDICTIONS GET WHAT ACTUALLY HAPPENED... NEED TICKER FOR THAT THOUGH
+        while histAppreciationDate.empty: # NOTE(review): never terminates if maxHist has no row on or after appreciationDate -- the date only moves forward
+            appreciationDate = appreciationDate + timedelta(days=1) # no price row in maxHist for this date (e.g. markets were closed), so measure the change on the next day that has one
+            appreciationDateString = appreciationDate.isoformat()
+            isAppreciationDate = maxHist.index==appreciationDateString
+            histAppreciationDate = maxHist[isAppreciationDate]
 
+        appreciationDatePrice = histAppreciationDate['Close'].array[0]
 
+        priceDifference = appreciationDatePrice - datePostedPrice
+        percentageChange = priceDifference / datePostedPrice
 
 
+        prediction = testPredictions[i]
 
+        # get performance of each prediction and add it to the performance dict
+        if prediction == 2:
+            performanceDict['appreciationOfBuys'].append(percentageChange)
 
-    # get new dates
-    dateIsCurrentStartDate = df.date == currentStartTrainDate
+        elif prediction == 1:
+            performanceDict['appreciationOfHolds'].append(percentageChange)
+
+        else:
+            performanceDict['depreciationOfSells'].append(percentageChange)
+
+    # now average out the values
+    for key in performanceDict:
+        categoryChanges = performanceDict[key] # look up by the loop variable; the string literal 'key' is not one of the dict's keys and raised KeyError
+        averagePerformance = sum(categoryChanges) / len(categoryChanges) # divide sum by number of predictions that we assessed in that category -- NOTE(review): still raises ZeroDivisionError when a category got no predictions in this window
+        performanceDict[key] = averagePerformance
+
+    print(f'Performance dict was {performanceDict}')    
+    print(f'Adding to list of performance dicts...')
+    performanceDicts.append(performanceDict)
+
+    # get new dates by first dropping all rows that were published on the same date as the row we're assessing
+    dateIsCurrentStartDate = df['date'].apply(date.fromtimestamp) == date.fromtimestamp(currentStartTrainDate) # convert row by row: date.fromtimestamp takes a single number, not a whole Series, so the mask has to be built element-wise
     df = df.drop(df[dateIsCurrentStartDate].index)
 
     currentStartTrainDate = df['date'].min()
-    currentEndTrainDate = currentDateStart + timedelta(days=trainingWindow)
+
+    currentEndTrainDate = date.fromtimestamp(currentStartTrainDate) + timedelta(days=trainingWindow)
     currentStartTestDate = currentEndTrainDate + timedelta(days=1)
-    currentEndTestDate = currentStartTestDate + timedelta(days=testingWindow) # set it to its correct value now, instead of the placeholder
+    currentEndTestDate = currentStartTestDate + timedelta(days=testingWindow)
+
+    # put em back into timestamp form so that we can use them in the filtering for the data
+    currentEndTrainDate = time.mktime(currentEndTrainDate.timetuple())
+    currentStartTestDate = time.mktime(currentStartTestDate.timetuple())
+    currentEndTestDate = time.mktime(currentEndTestDate.timetuple())
+
+
+keys = performanceDicts[0].keys() # every window's dict is created with the same three keys, so the first one's keys stand in for all -- NOTE(review): IndexError if the while loop never ran
+numPerformanceDicts = len(performanceDicts) # number of train/test windows that were assessed
+
+finalPerformance = {} # maps each category key to its average over all windows
+
+for key in keys:
+    finalPerformance[key] = 0
+
+    for i in range(numPerformanceDicts):
+        finalPerformance[key] = finalPerformance[key] + performanceDicts[i][key] # get sum of all values of this current key
+
+    finalPerformance[key] = finalPerformance[key]/numPerformanceDicts # turn the sum into a mean across windows
 
+print(f'Final performance: {finalPerformance}')
 
 
diff --git a/logreg.py b/logreg.py
@@ -1,45 +1,41 @@
 import pandas as pd
 import numpy as np
-from sklearn import preprocessing
+
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import log_loss, confusion_matrix
+from quantfeatures import dataToNumpy
 
 csv = 'data/quantfeatures.csv'
 
-if __name__ == '__main__':
-
-# NOW -- HAVE THIS CALL THE FUNCTINOS FROM QUANTFEATURES AND GET X AND Y FROM THERE RATHER THAN RE GENERATING IT HERE
-
-    # df = pd.read_csv(csv)
-    # data = df.to_numpy()
-    # print(f'Dataframe columns are {df.columns}')
-
-    # data = np.delete(data, [0, 1, 2], 1) # delete columns for ticker, ID on axis 1 
-
+def getModelFromNumpy(X, y): # fit a logistic regression on feature matrix X and labels y, and return the fitted model
+    clf = LogisticRegression(max_iter = 1000) # max_iter is set to 1000 explicitly rather than left at the library default
+    clf.fit(X, y)
+    return clf
 
-    # y = data[:, 7] # signal is 8th column (so index 7)
-    # X = np.delete(data, 7, 1) # X will be everything else
 
+def getModelFromDataframe(df): # method to be used in backtester: runs df through dataToNumpy and returns a model fitted on the result
+    X_scaled,y = dataToNumpy(df) # dataToNumpy does the feature engineering and hands back (X_scaled, y)
+    return getModelFromNumpy(X_scaled, y)
 
-    # scaler = preprocessing.StandardScaler().fit(X)
-    # X_scaled = scaler.transform(X)
+def getModelFromCSV(csv='data/postsWithDate.csv'): # read the posts CSV at `csv` and return a model fitted on it
+    df = pd.read_csv(csv)
+    return getModelFromDataframe(df) # the fitted model was previously discarded, so every caller received None
 
-    # # So columns are ['num_comments', 'score', 'upvote_ratio', 'ups', 'downs', 'is_locked',
-    # # 'is_self', 'compound_sentiment', 'selftext_length',
-    # # 'title_length', 'num_emojis', 'num_rockets', 'is_Gain', 'is_DD',
-    # # 'is_Discussion', 'is_News', 'is_Technical Analysis', 'is_Meme',
-    # # 'is_YOLO', 'is_Loss', 'is_Daily Discussion', 'is_Chart', 'is_Shitpost']
 
+def trainAndTestFromDataframes(trainDf, testDf): # fit a model on trainDf and return its predictions for the rows of testDf
+    model = getModelFromDataframe(trainDf)
+    testX, testY = dataToNumpy(testDf) # testY is not used below; only the features are needed to predict -- NOTE(review): testDf goes through dataToNumpy separately from trainDf, so confirm any scaling inside it is meant to be fitted per frame
+    predictions = model.predict(testX)
+    return predictions
 
-    # length = X_scaled.shape[0]
+if __name__ == '__main__':
 
-    # # normalize the y
-    # y = y+1
-    # y=y.astype('int')
+    X_scaled, y = dataToNumpy(pd.read_csv('data/postsWithDate.csv')) # dataToNumpy now takes a DataFrame (it calls df.apply on its argument), so load the CSV here instead of passing the path string
+    clf = getModelFromNumpy(X_scaled, y)
 
+    length = X_scaled.shape[0]
 
-    clf = LogisticRegression(max_iter = 1000)
-    clf.fit(X_scaled, y)
+    # MANUAL TUNING DONE HERE
     predictions = clf.predict(X_scaled)
     predictions_proba = clf.predict_proba(X_scaled)
 
@@ -116,4 +112,3 @@
     print(f"true two prediction zero average probabilities = {averageTrueTwoPredictionZeroProba / trueTwoPredictionZero}")
     print(f"true two prediction two average probabilities = {averageTrueTwoPredictionTwoProba / 80}")
 
-    # number of emojis?
diff --git a/quantfeatures.py b/quantfeatures.py
@@ -4,6 +4,7 @@
 import nltk
 from nltk.sentiment import SentimentIntensityAnalyzer
 import emoji
+from sklearn import preprocessing
 
 
 # we probably just care about compound sentiment -- other options are neg, neu, pos
@@ -55,12 +56,10 @@ def getNumEmojis(row, specificEmoji=""):
 
 
 
-def dataToNumpy():
+def dataToNumpy(df=pd.read_csv('data/postsWithDate.csv')):
     nltk.downloader.download('vader_lexicon')
     sia = SentimentIntensityAnalyzer()
-    csv = 'data/postsWithDate.csv'
-    csvOut = 'data/quantfeatures.csv'
-    df = pd.read_csv(csv)
+
 
     # engineer additional features, starting with sentiment and lengths of selftext and title
     df['compound_sentiment'] = df.apply(lambda row: getCompoundSentiment(row, sia), axis=1)
@@ -79,7 +78,7 @@ def dataToNumpy():
 
     df = df.drop(columns=['selftext', 'title', 'is_distinguished', 'link_flair_text'])
 
-        data = df.to_numpy()
+    data = df.to_numpy()
     print(f'Dataframe columns are {df.columns}')
 
     data = np.delete(data, [0, 1, 2], 1) # delete columns for ticker, ID on axis 1 
@@ -98,8 +97,6 @@ def dataToNumpy():
     # 'is_YOLO', 'is_Loss', 'is_Daily Discussion', 'is_Chart', 'is_Shitpost']
 
 
-    length = X_scaled.shape[0]
-
     # normalize the y
     y = y+1
     y=y.astype('int')
@@ -108,5 +105,4 @@ def dataToNumpy():
     # calling the log reg file - basically make log reg only take in an X and a Y, and that way we can replicate the "logreg" file in the backtester since all it's doing is fitting a model and not any
     # of the preprocessing -- that can all be handled here
 
-    return X, y
-
+    return X_scaled, y