Draft: Scenario 1 Sonja

FUB-HCC · Aug 10, 2021 · d511978 · d511978
1 parent 266a578
commit d511978
Show file tree

Hide file tree

Showing 7 changed files with 54,351 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+scenarios/sonja/twitter_sentiment_data.csv
+.gitignore
+.gitignore
diff --git a/scenarios/.DS_Store b/scenarios/.DS_Store
diff --git a/scenarios/sonja/.DS_Store b/scenarios/sonja/.DS_Store
diff --git a/scenarios/sonja/Generate_random _dates.ipynb b/scenarios/sonja/Generate_random _dates.ipynb
@@ -0,0 +1,197 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b068c96a-8c40-4354-a148-c9f4dfdc6d32",
+   "metadata": {},
+   "source": [
+    "# Generate Random Dates\n",
+    "\n",
+    "The dataset of tweets we are using in the example is downloaded from https://www.kaggle.com/edqian/twitter-climate-change-sentiment-dataset \n",
+    "The tweets do not contain the date they were posted.\n",
+    "To showcase how the development of interest in topics over time can be analyzed with the reflexive ML toolbox, we generate a random date for each tweet and save the dataset back to a csv-file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e54d8c12-530f-49f4-9d08-b6ab2d94c3ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import random\n",
+    "def getRandomDate(start_date=datetime.date(2019, 1, 1), end_date=datetime.date(2021, 1, 1)):\n",
+    "    \"\"\"Return a random date drawn uniformly from [start_date, end_date).\n",
+    "\n",
+    "    The defaults reproduce the window this notebook has always used:\n",
+    "    2019-01-01 (inclusive) up to 2021-01-01 (exclusive).\n",
+    "\n",
+    "    Args:\n",
+    "        start_date: Earliest datetime.date that may be returned.\n",
+    "        end_date: Exclusive upper bound; must be later than start_date.\n",
+    "\n",
+    "    Returns:\n",
+    "        A datetime.date inside the requested window.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If end_date is not later than start_date.\n",
+    "    \"\"\"\n",
+    "    days_between_dates = (end_date - start_date).days\n",
+    "    if days_between_dates <= 0:\n",
+    "        raise ValueError(\"end_date must be later than start_date\")\n",
+    "\n",
+    "    # randrange(n) yields 0..n-1, so end_date itself is never produced.\n",
+    "    random_number_of_days = random.randrange(days_between_dates)\n",
+    "    return start_date + datetime.timedelta(days=random_number_of_days)\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "7882d22a-150a-41fc-aebd-ccc02378f792",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "# Load the raw Kaggle export; it is expected in the notebook's working directory.\n",
+    "data = pd.read_csv(\n",
+    "    \"twitter_sentiment_data.csv\",\n",
+    "    encoding=\"utf-8\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "11179db5-def1-4821-9909-5c330d96cd63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Seed the generator so that Restart & Run All assigns the same dates every time.\n",
+    "RANDOM_SEED = 42\n",
+    "random.seed(RANDOM_SEED)\n",
+    "\n",
+    "# One independent draw per tweet, in row order; no placeholder column is needed.\n",
+    "data['publishedAt'] = [getRandomDate() for _ in range(len(data))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "aa1f0dc0-07d5-4539-868f-d1bd1c17c725",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sentiment</th>\n",
+       "      <th>message</th>\n",
+       "      <th>tweetid</th>\n",
+       "      <th>publishedAt</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-1</td>\n",
+       "      <td>@tiniebeany climate change is an interesting h...</td>\n",
+       "      <td>792927353886371840</td>\n",
+       "      <td>2019-10-20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>RT @NatGeoChannel: Watch #BeforeTheFlood right...</td>\n",
+       "      <td>793124211518832641</td>\n",
+       "      <td>2019-12-27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Fabulous! Leonardo #DiCaprio's film on #climat...</td>\n",
+       "      <td>793124402388832256</td>\n",
+       "      <td>2019-08-13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>RT @Mick_Fanning: Just watched this amazing do...</td>\n",
+       "      <td>793124635873275904</td>\n",
+       "      <td>2019-08-09</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2</td>\n",
+       "      <td>RT @cnalive: Pranita Biswasi, a Lutheran from ...</td>\n",
+       "      <td>793125156185137153</td>\n",
+       "      <td>2020-05-12</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   sentiment                                            message  \\\n",
+       "0         -1  @tiniebeany climate change is an interesting h...   \n",
+       "1          1  RT @NatGeoChannel: Watch #BeforeTheFlood right...   \n",
+       "2          1  Fabulous! Leonardo #DiCaprio's film on #climat...   \n",
+       "3          1  RT @Mick_Fanning: Just watched this amazing do...   \n",
+       "4          2  RT @cnalive: Pranita Biswasi, a Lutheran from ...   \n",
+       "\n",
+       "              tweetid publishedAt  \n",
+       "0  792927353886371840  2019-10-20  \n",
+       "1  793124211518832641  2019-12-27  \n",
+       "2  793124402388832256  2019-08-13  \n",
+       "3  793124635873275904  2019-08-09  \n",
+       "4  793125156185137153  2020-05-12  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five rows to check the newly added publishedAt column.\n",
+    "data.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ca09b9d9-19c8-41d7-9fc3-96fe4a4658ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the augmented frame to disk; the row index is stored as the first (unnamed) column.\n",
+    "data.to_csv(\n",
+    "    'twitter_sentiment_data_dates.csv',\n",
+    "    index=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "655a121e-689d-4b67-95f7-ce91f07e5f47",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}