diff --git a/jupyter/Collaborative Filtering.ipynb b/jupyter/Collaborative Filtering.ipynb index c6036e6..f3ee8e3 100755 --- a/jupyter/Collaborative Filtering.ipynb +++ b/jupyter/Collaborative Filtering.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -75,14 +75,6 @@ " pandas.reset_option('display.max_rows')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DataStax Enterprise Analytics\n", - "" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -94,12 +86,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Connect to DSE Analytics Cluster" + "### Connect to Cassandra" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -118,20 +110,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "session.execute(\"\"\"\n", " CREATE KEYSPACE IF NOT EXISTS accelerate \n", @@ -149,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -165,20 +146,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "query = \"CREATE TABLE IF NOT EXISTS jokes \\\n", " (userid int, jokeid int, rating 
float, \\\n", @@ -205,12 +175,12 @@ "* This is a file I created from the *.dat file and I only have 10,000 rows -- dataset has over 1 million rows\n", "\n", "\n", - "#### Insert all the Joke Rating Data into the DSE table `jokes`" + "#### Insert all the Joke Rating Data into the table `jokes`" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -235,27 +205,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100 5 -0.875\n", - "100 7 9.906000137329102\n", - "100 8 -0.843999981880188\n", - "100 13 8.937999725341797\n", - "100 15 -0.968999981880188\n", - "100 16 -9.75\n", - "100 17 9.593999862670898\n" - ] - } - ], - "source": [ - "query = 'SELECT * FROM jokes WHERE userid = 100'\n", + "outputs": [], + "source": [ + "query = 'SELECT * FROM jokes WHERE userid = 65'\n", "rows = session.execute(query)\n", "for row in rows:\n", " print (row.userid, row.jokeid, row.rating)" @@ -265,7 +221,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Machine Learning with DSE Analytics and Apache Spark\n", "" ] }, @@ -280,23 +235,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Create a spark session that is connected to DSE. From there load each table into a Spark Dataframe and take a count of the number of rows in each." + "#### Create a spark session that is connected to Cassandra. From there load each table into a Spark Dataframe and take a count of the number of rows in each." 
] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Table Row Count: \n", - "10000\n" - ] - } - ], + "outputs": [], "source": [ "spark = SparkSession.builder.appName('demo').master(\"local\").getOrCreate()\n", "\n", @@ -306,22 +252,6 @@ "print (jokeTable.count())" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### CFilter with PySpark requires that the ratings not be double/foat but int" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "#joke_df = jokeTable.withColumn(\"rating\", jokeTable.rating.cast('int'))" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -331,118 +261,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
useridjokeidrating
0150
117-9
218-9
3113-6
41150
5117-9
6118-7
7119-8
8120-9
9121-7
\n", - "
" - ], - "text/plain": [ - " userid jokeid rating\n", - "0 1 5 0 \n", - "1 1 7 -9 \n", - "2 1 8 -9 \n", - "3 1 13 -6 \n", - "4 1 15 0 \n", - "5 1 17 -9 \n", - "6 1 18 -7 \n", - "7 1 19 -8 \n", - "8 1 20 -9 \n", - "9 1 21 -7 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "(training, test) = jokeTable.randomSplit([0.8, 0.2])\n", "\n", @@ -463,7 +284,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -475,119 +296,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
useridrecommendations
0148[(44, 34.13772964477539), (51, 25.700042724609375), (125, 24.875423431396484), (146, 24.236581802368164), (69, 23.100719451904297), (143, 23.069561004638672), (119, 22.743892669677734), (65, 21.494281768798828), (138, 20.9459228515625), (26, 20.843549728393555)]
1243[(98, 16.153411865234375), (79, 15.212631225585938), (101, 13.930389404296875), (57, 12.906051635742188), (58, 12.793245315551758), (24, 9.772829055786133), (85, 9.358484268188477), (13, 8.649678230285645), (23, 7.858341693878174), (37, 7.614729404449463)]
2251[(55, 4.414163112640381), (52, 4.053103446960449), (124, 3.2235026359558105), (133, 1.8868601322174072), (141, 1.1414566040039062), (120, 0.8265798687934875), (122, 0.6061234474182129), (44, 0.4783670902252197), (114, 0.1717315912246704), (82, -0.7522410750389099)]
385[(40, 93.38774108886719), (48, 70.61991882324219), (90, 61.215755462646484), (86, 56.27168655395508), (114, 55.27031326293945), (43, 53.20564651489258), (60, 47.55717468261719), (100, 42.91171646118164), (33, 42.73453140258789), (104, 42.418277740478516)]
4137[(37, 11.013225555419922), (100, 9.195761680603027), (15, 9.175329208374023), (7, 8.999856948852539), (8, 8.987367630004883), (17, 8.032954216003418), (78, 7.779767036437988), (20, 7.432913780212402), (57, 7.157354354858398), (13, 7.101101875305176)]
565[(94, 13.413860321044922), (82, 11.760915756225586), (80, 11.44350528717041), (116, 11.285964012145996), (63, 11.251116752624512), (26, 11.184839248657227), (99, 11.051153182983398), (43, 10.730409622192383), (75, 10.707722663879395), (92, 10.706671714782715)]
653[(114, 13.702998161315918), (116, 13.296106338500977), (117, 12.910265922546387), (80, 11.056073188781738), (105, 9.647844314575195), (148, 9.136292457580566), (91, 8.573720932006836), (60, 8.479909896850586), (55, 8.326761245727539), (52, 8.317741394042969)]
7133[(35, 8.389063835144043), (94, 8.131109237670898), (32, 8.074833869934082), (121, 7.820661544799805), (53, 7.75675106048584), (89, 7.586538314819336), (119, 6.810390949249268), (46, 6.713859558105469), (69, 6.705226421356201), (81, 6.654937744140625)]
8155[(46, 43.723426818847656), (35, 38.590579986572266), (119, 37.78889846801758), (53, 37.23609161376953), (26, 36.63851547241211), (69, 36.21256637573242), (34, 35.53752899169922), (21, 34.712867736816406), (32, 33.222694396972656), (125, 31.87568473815918)]
9108[(75, 37.769920349121094), (40, 33.574073791503906), (102, 28.94316864013672), (114, 26.217697143554688), (43, 25.416234970092773), (124, 25.390714645385742), (100, 24.526344299316406), (48, 24.486526489257812), (81, 23.37546157836914), (86, 23.240493774414062)]
\n", - "
" - ], - "text/plain": [ - " userid \\\n", - "0 148 \n", - "1 243 \n", - "2 251 \n", - "3 85 \n", - "4 137 \n", - "5 65 \n", - "6 53 \n", - "7 133 \n", - "8 155 \n", - "9 108 \n", - "\n", - " recommendations \n", - "0 [(44, 34.13772964477539), (51, 25.700042724609375), (125, 24.875423431396484), (146, 24.236581802368164), (69, 23.100719451904297), (143, 23.069561004638672), (119, 22.743892669677734), (65, 21.494281768798828), (138, 20.9459228515625), (26, 20.843549728393555)] \n", - "1 [(98, 16.153411865234375), (79, 15.212631225585938), (101, 13.930389404296875), (57, 12.906051635742188), (58, 12.793245315551758), (24, 9.772829055786133), (85, 9.358484268188477), (13, 8.649678230285645), (23, 7.858341693878174), (37, 7.614729404449463)] \n", - "2 [(55, 4.414163112640381), (52, 4.053103446960449), (124, 3.2235026359558105), (133, 1.8868601322174072), (141, 1.1414566040039062), (120, 0.8265798687934875), (122, 0.6061234474182129), (44, 0.4783670902252197), (114, 0.1717315912246704), (82, -0.7522410750389099)] \n", - "3 [(40, 93.38774108886719), (48, 70.61991882324219), (90, 61.215755462646484), (86, 56.27168655395508), (114, 55.27031326293945), (43, 53.20564651489258), (60, 47.55717468261719), (100, 42.91171646118164), (33, 42.73453140258789), (104, 42.418277740478516)] \n", - "4 [(37, 11.013225555419922), (100, 9.195761680603027), (15, 9.175329208374023), (7, 8.999856948852539), (8, 8.987367630004883), (17, 8.032954216003418), (78, 7.779767036437988), (20, 7.432913780212402), (57, 7.157354354858398), (13, 7.101101875305176)] \n", - "5 [(94, 13.413860321044922), (82, 11.760915756225586), (80, 11.44350528717041), (116, 11.285964012145996), (63, 11.251116752624512), (26, 11.184839248657227), (99, 11.051153182983398), (43, 10.730409622192383), (75, 10.707722663879395), (92, 10.706671714782715)] \n", - "6 [(114, 13.702998161315918), (116, 13.296106338500977), (117, 12.910265922546387), (80, 11.056073188781738), (105, 9.647844314575195), (148, 9.136292457580566), (91, 
8.573720932006836), (60, 8.479909896850586), (55, 8.326761245727539), (52, 8.317741394042969)] \n", - "7 [(35, 8.389063835144043), (94, 8.131109237670898), (32, 8.074833869934082), (121, 7.820661544799805), (53, 7.75675106048584), (89, 7.586538314819336), (119, 6.810390949249268), (46, 6.713859558105469), (69, 6.705226421356201), (81, 6.654937744140625)] \n", - "8 [(46, 43.723426818847656), (35, 38.590579986572266), (119, 37.78889846801758), (53, 37.23609161376953), (26, 36.63851547241211), (69, 36.21256637573242), (34, 35.53752899169922), (21, 34.712867736816406), (32, 33.222694396972656), (125, 31.87568473815918)] \n", - "9 [(75, 37.769920349121094), (40, 33.574073791503906), (102, 28.94316864013672), (114, 26.217697143554688), (43, 25.416234970092773), (124, 25.390714645385742), (100, 24.526344299316406), (48, 24.486526489257812), (81, 23.37546157836914), (86, 23.240493774414062)] " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Evaluate the model by computing the RMSE on the test data\n", "predictions = model.transform(testing_df)\n", @@ -603,141 +314,30 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
useridrecommendations
065[(94, 13.413860321044922), (82, 11.760915756225586), (80, 11.44350528717041), (116, 11.285964012145996), (63, 11.251116752624512), (26, 11.184839248657227), (99, 11.051153182983398), (43, 10.730409622192383), (75, 10.707722663879395), (92, 10.706671714782715)]
\n", - "
" - ], - "text/plain": [ - " userid \\\n", - "0 65 \n", - "\n", - " recommendations \n", - "0 [(94, 13.413860321044922), (82, 11.760915756225586), (80, 11.44350528717041), (116, 11.285964012145996), (63, 11.251116752624512), (26, 11.184839248657227), (99, 11.051153182983398), (43, 10.730409622192383), (75, 10.707722663879395), (92, 10.706671714782715)] " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "showDF(userRecs.filter(userRecs.userid == 65))" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "IFrame(src='images/init94.html', width=700, height=200)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "IFrame(src='images/init43.html', width=700, height=200)" ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.execute(\"\"\"drop table jokes\"\"\")" - ] } ], "metadata": { @@ -756,7 +356,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/jupyter/FP-Growth.ipynb b/jupyter/FP-Growth.ipynb index b6bd61f..0a18b11 100755 --- a/jupyter/FP-Growth.ipynb +++ b/jupyter/FP-Growth.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -89,20 +89,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Connect to DSE Analytics Cluster" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DataStax Enterprise Analytics\n", "" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -121,20 +113,9 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "session.execute(\"\"\"\n", " CREATE KEYSPACE IF NOT EXISTS accelerate \n", @@ -151,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -167,20 +148,9 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "query = \"CREATE TABLE IF NOT EXISTS movies \\\n", " (movieid int, title text, genres text, \\\n", @@ -197,20 +167,9 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "query = \"CREATE TABLE IF NOT EXISTS movieratings \\\n", " (userid int, movieid int, rating 
float, timestamp text, \\\n", @@ -252,12 +211,12 @@ "### Load Movie datasets from CSV file (rating_movies.csv, movies.csv)\n", "* No clean up was requried! How nice :)\n", "\n", - "#### Insert all the Movie Data into the DSE table `movies` and `movieratings`" + "#### Insert all the Movie Data into the tables `movies` and `movieratings`" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -293,7 +252,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Machine Learning with DSE Analytics and Apache Spark\n", + "## Machine Learning with Apache Spark\n", "" ] }, @@ -301,23 +260,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Create a spark session that is connected to DSE. From there load each table into a Spark Dataframe and take a count of the number of rows in each." + "#### Create a spark session that is connected to Cassandra. From there load each table into a Spark Dataframe and take a count of the number of rows in each." ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Table Row Count: \n", - "100000\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "spark = SparkSession.builder.appName('demo').master(\"local\").getOrCreate()\n", "\n", @@ -329,89 +279,9 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
useridmovieidratingtimestamp
02314.012/24/98 0:13\\n
123254.012/24/98 0:07\\n
223325.012/24/98 0:12\\n
323475.012/24/98 0:06\\n
423505.012/24/98 0:02\\n
\n", - "
" - ], - "text/plain": [ - " userid movieid rating timestamp\n", - "0 23 1 4.0 12/24/98 0:13\\n\n", - "1 23 25 4.0 12/24/98 0:07\\n\n", - "2 23 32 5.0 12/24/98 0:12\\n\n", - "3 23 47 5.0 12/24/98 0:06\\n\n", - "4 23 50 5.0 12/24/98 0:02\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "showDF(movieDF)" ] @@ -432,83 +302,9 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
useridmovieidrating
02314.0
123254.0
223325.0
323475.0
423505.0
\n", - "
" - ], - "text/plain": [ - " userid movieid rating\n", - "0 23 1 4.0\n", - "1 23 25 4.0\n", - "2 23 32 5.0\n", - "3 23 47 5.0\n", - "4 23 50 5.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "newMovieDF = movieDF.drop('timestamp')\n", "showDF(newMovieDF)" @@ -523,83 +319,9 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
useridmovieidrating
02314.0
123254.0
223325.0
323475.0
423505.0
\n", - "
" - ], - "text/plain": [ - " userid movieid rating\n", - "0 23 1 4.0\n", - "1 23 25 4.0\n", - "2 23 32 5.0\n", - "3 23 47 5.0\n", - "4 23 50 5.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "newestMovies = newMovieDF.filter(\"rating > 3\")\n", "showDF(newestMovies)" @@ -614,42 +336,9 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+--------------------+\n", - "|userid| moviesRated|\n", - "+------+--------------------+\n", - "| 463|[161, 509, 590, 2...|\n", - "| 148|[356, 4995, 539, ...|\n", - "| 471|[508, 356, 4008, ...|\n", - "| 496|[356, 1953, 1395,...|\n", - "| 623|[356, 165, 593, 5...|\n", - "| 243|[592, 356, 153, 1...|\n", - "| 392|[3254, 596, 4995,...|\n", - "| 540|[356, 70286, 5874...|\n", - "| 31|[54997, 45517, 70...|\n", - "| 516|[356, 785, 1345, ...|\n", - "| 137|[356, 3173, 1222,...|\n", - "| 251|[466, 356, 110, 3...|\n", - "| 85|[945, 916, 1293, ...|\n", - "| 580|[4306, 4973, 1223...|\n", - "| 451|[6440, 1222, 714,...|\n", - "| 458|[299, 2396, 3798,...|\n", - "| 65|[356, 40870, 7609...|\n", - "| 53|[1953, 1649, 1172...|\n", - "| 255|[2, 2006, 380, 20...|\n", - "| 588|[4995, 4343, 3450...|\n", - "+------+--------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "group_user = newestMovies.groupBy('userid').agg(collect_set('movieid').alias('moviesRated'))\n", "group_user.show()\n" @@ -664,77 +353,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
useriditems
0463[161, 509, 590, 277, 105, 410, 207, 25, 381, 5...
1148[356, 4995, 539, 916, 2340, 4285, 1680, 4062, ...
2471[508, 356, 4008, 1272, 8982, 3450, 2078, 2028,...
3496[356, 1953, 1395, 1476, 1222, 3421, 2028, 1266...
4623[356, 165, 593, 590, 318, 292, 50, 296, 380, 3...
\n", - "
" - ], - "text/plain": [ - " userid items\n", - "0 463 [161, 509, 590, 277, 105, 410, 207, 25, 381, 5...\n", - "1 148 [356, 4995, 539, 916, 2340, 4285, 1680, 4062, ...\n", - "2 471 [508, 356, 4008, 1272, 8982, 3450, 2078, 2028,...\n", - "3 496 [356, 1953, 1395, 1476, 1222, 3421, 2028, 1266...\n", - "4 623 [356, 165, 593, 590, 318, 292, 50, 296, 380, 3..." - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df = group_user.withColumnRenamed(\"moviesRated\", \"items\")\n", "showDF(df)" @@ -742,20 +363,9 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "702" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df.select('userid').distinct().count()" ] @@ -772,42 +382,9 @@ }, { "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------+--------------------+--------------------+\n", - "|userid| items| prediction|\n", - "+------+--------------------+--------------------+\n", - "| 463|[161, 509, 590, 2...|[50, 356, 2858, 2...|\n", - "| 148|[356, 4995, 539, ...|[4993, 2959, 296,...|\n", - "| 471|[508, 356, 4008, ...|[296, 589, 2858, ...|\n", - "| 496|[356, 1953, 1395,...|[318, 593, 296, 2...|\n", - "| 623|[356, 165, 593, 5...|[2858, 47, 260, 5...|\n", - "| 243|[592, 356, 153, 1...|[593, 260, 527, 2...|\n", - "| 392|[3254, 596, 4995,...|[356, 527, 260, 5...|\n", - "| 540|[356, 70286, 5874...|[593, 318, 589, 4...|\n", - "| 31|[54997, 45517, 70...|[356, 527, 296, 5...|\n", - "| 516|[356, 785, 1345, ...|[318, 593, 2858, ...|\n", - "| 137|[356, 3173, 1222,...|[50, 47, 5952, 49...|\n", - "| 251|[466, 356, 110, 3...|[318, 593, 457, 2...|\n", - "| 85|[945, 916, 1293, ...|[2959, 296, 2571,...|\n", - "| 580|[4306, 
4973, 1223...|[318, 356, 593, 5...|\n", - "| 451|[6440, 1222, 714,...|[1198, 260, 1196,...|\n", - "| 458|[299, 2396, 3798,...|[50, 296, 858, 60...|\n", - "| 65|[356, 40870, 7609...|[1198, 260, 296, ...|\n", - "| 53|[1953, 1649, 1172...|[296, 318, 593, 2...|\n", - "| 255|[2, 2006, 380, 20...|[150, 356, 457, 5...|\n", - "| 588|[4995, 4343, 3450...|[50, 296, 1221, 2...|\n", - "+------+--------------------+--------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "fpGrowth = FPGrowth(itemsCol=\"items\", minSupport=0.1, minConfidence=0.2)\n", "model = fpGrowth.fit(df)\n", @@ -824,42 +401,9 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+------------+----------+------------------+------------------+\n", - "| antecedent|consequent| confidence| lift|\n", - "+------------+----------+------------------+------------------+\n", - "| [608, 593]| [296]|0.8301886792452831| 2.111566858080394|\n", - "| [608, 593]| [318]|0.7075471698113207| 1.793133982698726|\n", - "|[7153, 2959]| [4993]| 0.935064935064935| 4.405473720909962|\n", - "| [541]| [1198]|0.5658914728682171|2.2069767441860466|\n", - "| [541]| [260]|0.6124031007751938| 2.01834261382247|\n", - "| [541]| [296]|0.6744186046511628|1.7153690596562183|\n", - "| [541]| [593]|0.5581395348837209|1.6124031007751938|\n", - "| [541]| [1196]|0.5968992248062015| 2.289744567289363|\n", - "| [541]| [2571]|0.6589147286821705|2.1922186707814393|\n", - "| [527, 593]| [356]|0.6428571428571429|1.6714285714285715|\n", - "| [527, 593]| [296]|0.7053571428571429|1.7940605590062113|\n", - "| [527, 593]| [318]|0.7142857142857143|1.8102114492006187|\n", - "| [2028, 296]| [2858]| 0.782608695652174|2.7197589324149805|\n", - "| [1196, 296]| [260]|0.8061224489795918|2.6567979304397817|\n", - "| [480, 593]| [356]|0.7916666666666666| 2.058333333333333|\n", - "| 
[858, 50]| [296]|0.8064516129032258|2.0511921458625526|\n", - "| [858, 50]| [318]|0.7741935483870968| 1.962035635262606|\n", - "| [1221, 296]| [858]| 0.961038961038961|3.8332349468713103|\n", - "|[4226, 2959]| [4993]|0.6574074074074074| 3.097315436241611|\n", - "|[4226, 2959]| [2858]|0.7129629629629629|2.4777227722772275|\n", - "+------------+----------+------------------+------------------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Display frequent itemsets.\n", "#model.freqItemsets.show()\n", @@ -872,20 +416,9 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "If you like these movies: \n", - "[[608, 593]]\n", - "Then you will like this movie:\n", - "[[296]]\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"If you like these movies: \")\n", "print(list(dfAssociation.select('antecedent').first()))\n", @@ -900,25 +433,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Query DSE to get movie titles" + "#### Query database to get movie titles" ] }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Fargo (1996)\n", - "\n", - "\"Silence of the Lambs\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "query = \"select title from movies WHERE movieid=\"\n", "query = query + str(movieYoulike[0][0])\n", @@ -948,18 +470,9 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Pulp Fiction (1994)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "query = \"select title from movies WHERE movieid=\"\n", "query = query + 
str(movieToRecommend[0][0])\n", diff --git a/jupyter/Naivebayes.ipynb b/jupyter/Naivebayes.ipynb index 4d01b8c..226e5de 100755 --- a/jupyter/Naivebayes.ipynb +++ b/jupyter/Naivebayes.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -82,7 +82,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# DataStax Enterprise Analytics\n", "" ] }, @@ -97,12 +96,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Connect to DSE Analytics Cluster" + "### Connect to Cassandra" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -121,20 +120,11 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "session.execute(\"\"\"\n", " CREATE KEYSPACE IF NOT EXISTS accelerate \n", @@ -152,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -168,20 +158,9 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "query = \"CREATE TABLE IF NOT EXISTS wines \\\n", " (wineid int, fixedAcidity float, volatileAcidity float, citricAcid float, sugar float, \\\n", @@ -231,7 +210,7 
@@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -269,7 +248,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Machine Learning with DSE Analytics and Apache Spark\n", + "## Machine Learning with Apache Cassandra & Apache Spark\n", "" ] }, @@ -277,23 +256,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Create a spark session that is connected to DSE. From there load each table into a Spark Dataframe and take a count of the number of rows in each." + "#### Create a spark session that is connected to the database. From there load each table into a Spark Dataframe and take a count of the number of rows in each." ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Table Wine Row Count: \n", - "6497\n" - ] - } - ], + "outputs": [], "source": [ "spark = SparkSession.builder.appName('demo').master(\"local\").getOrCreate()\n", "\n", @@ -305,150 +275,9 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wineidalcoholchloridescitricaciddensityfixedacidityfreesulfurphqualitysugarsulphatestotalsulfurvolatileacidity
0431710.80.0460.290.995186.859.03.206.010.40.40143.00.16
1337210.90.0590.260.995507.832.03.046.09.50.43178.00.40
215849.80.0740.290.995786.232.03.335.02.10.6298.00.46
348309.40.0560.570.995486.760.02.966.06.60.43150.00.13
427319.70.0470.340.994406.924.03.206.04.00.52128.00.23
\n", - "
" - ], - "text/plain": [ - " wineid alcohol chlorides citricacid density fixedacidity freesulfur \\\n", - "0 4317 10.8 0.046 0.29 0.99518 6.8 59.0 \n", - "1 3372 10.9 0.059 0.26 0.99550 7.8 32.0 \n", - "2 1584 9.8 0.074 0.29 0.99578 6.2 32.0 \n", - "3 4830 9.4 0.056 0.57 0.99548 6.7 60.0 \n", - "4 2731 9.7 0.047 0.34 0.99440 6.9 24.0 \n", - "\n", - " ph quality sugar sulphates totalsulfur volatileacidity \n", - "0 3.20 6.0 10.4 0.40 143.0 0.16 \n", - "1 3.04 6.0 9.5 0.43 178.0 0.40 \n", - "2 3.33 5.0 2.1 0.62 98.0 0.46 \n", - "3 2.96 6.0 6.6 0.43 150.0 0.13 \n", - "4 3.20 6.0 4.0 0.52 128.0 0.23 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "showDF(wineDF)" ] @@ -462,150 +291,9 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wineidalcoholchloridescitricaciddensityfixedacidityfreesulfurphqualitysugarsulphatestotalsulfurvolatileacidity
0431710.80.0460.290.995186.859.03.206.010.40.40143.00.16
1337210.90.0590.260.995507.832.03.046.09.50.43178.00.40
248309.40.0560.570.995486.760.02.966.06.60.43150.00.13
327319.70.0470.340.994406.924.03.206.04.00.52128.00.23
47699.70.0820.020.997447.124.03.556.02.30.5394.00.59
\n", - "
" - ], - "text/plain": [ - " wineid alcohol chlorides citricacid density fixedacidity freesulfur \\\n", - "0 4317 10.8 0.046 0.29 0.99518 6.8 59.0 \n", - "1 3372 10.9 0.059 0.26 0.99550 7.8 32.0 \n", - "2 4830 9.4 0.056 0.57 0.99548 6.7 60.0 \n", - "3 2731 9.7 0.047 0.34 0.99440 6.9 24.0 \n", - "4 769 9.7 0.082 0.02 0.99744 7.1 24.0 \n", - "\n", - " ph quality sugar sulphates totalsulfur volatileacidity \n", - "0 3.20 6.0 10.4 0.40 143.0 0.16 \n", - "1 3.04 6.0 9.5 0.43 178.0 0.40 \n", - "2 2.96 6.0 6.6 0.43 150.0 0.13 \n", - "3 3.20 6.0 4.0 0.52 128.0 0.23 \n", - "4 3.55 6.0 2.3 0.53 94.0 0.59 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "wine6DF = wineDF.filter(\"quality > 5\")\n", "showDF(wine6DF)" @@ -620,176 +308,9 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wineidalcoholchloridescitricaciddensityfixedacidityfreesulfurphqualitysugarsulphatestotalsulfurvolatileacidityfeatureslabel
0431710.80.0460.290.995186.859.03.206.010.40.40143.00.16[10.800000190734863, 0.04600000008940697, 0.28...0.0
1337210.90.0590.260.995507.832.03.046.09.50.43178.00.40[10.899999618530273, 0.05900000035762787, 0.25...0.0
248309.40.0560.570.995486.760.02.966.06.60.43150.00.13[9.399999618530273, 0.0560000017285347, 0.5699...0.0
327319.70.0470.340.994406.924.03.206.04.00.52128.00.23[9.699999809265137, 0.04699999839067459, 0.340...0.0
47699.70.0820.020.997447.124.03.556.02.30.5394.00.59[9.699999809265137, 0.0820000022649765, 0.0199...0.0
\n", - "
" - ], - "text/plain": [ - " wineid alcohol chlorides citricacid density fixedacidity freesulfur \\\n", - "0 4317 10.8 0.046 0.29 0.99518 6.8 59.0 \n", - "1 3372 10.9 0.059 0.26 0.99550 7.8 32.0 \n", - "2 4830 9.4 0.056 0.57 0.99548 6.7 60.0 \n", - "3 2731 9.7 0.047 0.34 0.99440 6.9 24.0 \n", - "4 769 9.7 0.082 0.02 0.99744 7.1 24.0 \n", - "\n", - " ph quality sugar sulphates totalsulfur volatileacidity \\\n", - "0 3.20 6.0 10.4 0.40 143.0 0.16 \n", - "1 3.04 6.0 9.5 0.43 178.0 0.40 \n", - "2 2.96 6.0 6.6 0.43 150.0 0.13 \n", - "3 3.20 6.0 4.0 0.52 128.0 0.23 \n", - "4 3.55 6.0 2.3 0.53 94.0 0.59 \n", - "\n", - " features label \n", - "0 [10.800000190734863, 0.04600000008940697, 0.28... 0.0 \n", - "1 [10.899999618530273, 0.05900000035762787, 0.25... 0.0 \n", - "2 [9.399999618530273, 0.0560000017285347, 0.5699... 0.0 \n", - "3 [9.699999809265137, 0.04699999839067459, 0.340... 0.0 \n", - "4 [9.699999809265137, 0.0820000022649765, 0.0199... 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4113\n" - ] - } - ], + "outputs": [], "source": [ "assembler = VectorAssembler(\n", " inputCols=['alcohol', 'chlorides', 'citricacid', 'density', 'fixedacidity', 'ph', 'freesulfur', 'sugar', 'sulphates', 'totalsulfur', 'volatileacidity'],\n", @@ -808,25 +329,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### We will be training a model with Naive Bays, and because of this we need to split up our dataset in to a training and test set. Will split 80/20. " + "We need to split up our dataset in to a training and test set. Will split 80/20. 
" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train Dataframe Row Count: \n", - "3366\n", - "Test Dataframe Row Count: \n", - "747\n" - ] - } - ], + "outputs": [], "source": [ "# Split the data into train and test\n", "splits = trainingData1.randomSplit([0.8, 0.2], 1234)\n", @@ -849,208 +359,9 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "747\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wineidalcoholchloridescitricaciddensityfixedacidityfreesulfurphqualitysugarsulphatestotalsulfurvolatileacidityfeatureslabelrawPredictionprobabilityprediction
0810.00.0650.000.99467.315.03.397.01.20.4721.00.650[10.0, 0.06499999761581421, 0.0, 0.99459999799...1.0[-116.61332062455122, -116.11569707646935, -11...[0.3624807686165096, 0.5962112012919782, 0.038...1.0
1309.80.0820.000.99647.88.03.386.02.00.5916.00.645[9.800000190734863, 0.0820000022649765, 0.0, 0...0.0[-107.00826876792438, -106.69331316801713, -10...[0.4109511317563823, 0.5630846130734202, 0.023...1.0
24310.50.3320.200.99687.58.03.216.02.60.9014.00.490[10.5, 0.3319999873638153, 0.20000000298023224...0.0[-112.66442184314677, -112.30188509993125, -11...[0.3992810255077367, 0.5737548627763441, 0.022...1.0
312910.50.0650.160.99628.03.03.427.01.80.9216.00.590[10.5, 0.06499999761581421, 0.1599999964237213...1.0[-102.8833290398864, -102.54742453805858, -106...[0.40916428022145745, 0.5725048690933248, 0.01...1.0
42219.40.0820.370.99647.824.03.346.02.00.5958.00.340[9.399999618530273, 0.0820000022649765, 0.3700...0.0[-151.32966193268658, -151.34435617974762, -15...[0.4777992886623408, 0.47082971956014313, 0.04...0.0
\n", - "
" - ], - "text/plain": [ - " wineid alcohol chlorides citricacid density fixedacidity freesulfur \\\n", - "0 8 10.0 0.065 0.00 0.9946 7.3 15.0 \n", - "1 30 9.8 0.082 0.00 0.9964 7.8 8.0 \n", - "2 43 10.5 0.332 0.20 0.9968 7.5 8.0 \n", - "3 129 10.5 0.065 0.16 0.9962 8.0 3.0 \n", - "4 221 9.4 0.082 0.37 0.9964 7.8 24.0 \n", - "\n", - " ph quality sugar sulphates totalsulfur volatileacidity \\\n", - "0 3.39 7.0 1.2 0.47 21.0 0.650 \n", - "1 3.38 6.0 2.0 0.59 16.0 0.645 \n", - "2 3.21 6.0 2.6 0.90 14.0 0.490 \n", - "3 3.42 7.0 1.8 0.92 16.0 0.590 \n", - "4 3.34 6.0 2.0 0.59 58.0 0.340 \n", - "\n", - " features label \\\n", - "0 [10.0, 0.06499999761581421, 0.0, 0.99459999799... 1.0 \n", - "1 [9.800000190734863, 0.0820000022649765, 0.0, 0... 0.0 \n", - "2 [10.5, 0.3319999873638153, 0.20000000298023224... 0.0 \n", - "3 [10.5, 0.06499999761581421, 0.1599999964237213... 1.0 \n", - "4 [9.399999618530273, 0.0820000022649765, 0.3700... 0.0 \n", - "\n", - " rawPrediction \\\n", - "0 [-116.61332062455122, -116.11569707646935, -11... \n", - "1 [-107.00826876792438, -106.69331316801713, -10... \n", - "2 [-112.66442184314677, -112.30188509993125, -11... \n", - "3 [-102.8833290398864, -102.54742453805858, -106... \n", - "4 [-151.32966193268658, -151.34435617974762, -15... \n", - "\n", - " probability prediction \n", - "0 [0.3624807686165096, 0.5962112012919782, 0.038... 1.0 \n", - "1 [0.4109511317563823, 0.5630846130734202, 0.023... 1.0 \n", - "2 [0.3992810255077367, 0.5737548627763441, 0.022... 1.0 \n", - "3 [0.40916428022145745, 0.5725048690933248, 0.01... 1.0 \n", - "4 [0.4777992886623408, 0.47082971956014313, 0.04... 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "nb = NaiveBayes(smoothing=1.0, modelType=\"multinomial\")\n", "\n", @@ -1065,96 +376,9 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
qualitylabelpredictionprobability
07.01.01.0[0.3624807686165096, 0.5962112012919782, 0.038...
16.00.01.0[0.4109511317563823, 0.5630846130734202, 0.023...
26.00.01.0[0.3992810255077367, 0.5737548627763441, 0.022...
37.01.01.0[0.40916428022145745, 0.5725048690933248, 0.01...
46.00.00.0[0.4777992886623408, 0.47082971956014313, 0.04...
\n", - "
" - ], - "text/plain": [ - " quality label prediction \\\n", - "0 7.0 1.0 1.0 \n", - "1 6.0 0.0 1.0 \n", - "2 6.0 0.0 1.0 \n", - "3 7.0 1.0 1.0 \n", - "4 6.0 0.0 0.0 \n", - "\n", - " probability \n", - "0 [0.3624807686165096, 0.5962112012919782, 0.038... \n", - "1 [0.4109511317563823, 0.5630846130734202, 0.023... \n", - "2 [0.3992810255077367, 0.5737548627763441, 0.022... \n", - "3 [0.40916428022145745, 0.5725048690933248, 0.01... \n", - "4 [0.4777992886623408, 0.47082971956014313, 0.04... " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "showDF(predictions.select(\"quality\", \"label\", \"prediction\", \"probability\"))" ] @@ -1168,17 +392,9 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test set accuracy = 0.6010709504685409\n" - ] - } - ], + "outputs": [], "source": [ "# compute accuracy on the test set\n", "evaluator = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\",\n", @@ -1186,26 +402,6 @@ "accuracy = evaluator.evaluate(predictions)\n", "print(\"Test set accuracy = \" + str(accuracy))" ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.execute(\"\"\"drop table wines\"\"\")" - ] } ], "metadata": { diff --git a/jupyter/Random Forest.ipynb b/jupyter/Random Forest.ipynb index d548234..7fdd499 100755 --- a/jupyter/Random Forest.ipynb +++ b/jupyter/Random Forest.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +61,7 @@ 
}, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -76,14 +76,6 @@ " pandas.reset_option('display.max_rows')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# DataStax Enterprise Analytics\n", - "" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -95,12 +87,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Connect to DSE Analytics Cluster" + "### Connect to Cassandra" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -119,20 +111,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "session.execute(\"\"\"\n", " CREATE KEYSPACE IF NOT EXISTS accelerate \n", @@ -150,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -166,20 +147,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "query = \"CREATE TABLE IF NOT EXISTS wines \\\n", " (wineid int, fixedAcidity float, volatileAcidity float, citricAcid float, sugar float, \\\n", @@ -229,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -267,7 +237,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Machine Learning with DSE Analytics and Apache Spark\n", + "## Machine Learning with Apache Cassandra and Apache Spark\n", "" ] }, @@ -275,23 +245,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Create a 
spark session that is connected to DSE. From there load each table into a Spark Dataframe and take a count of the number of rows in each." + "#### Create a spark session that is connected to the database. From there load each table into a Spark Dataframe and take a count of the number of rows in each." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Table Wine Row Count: \n", - "6497\n" - ] - } - ], + "outputs": [], "source": [ "spark = SparkSession.builder.appName('demo').master(\"local\").getOrCreate()\n", "\n", @@ -304,150 +265,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wineidalcoholchloridescitricaciddensityfixedacidityfreesulfurphqualitysugarsulphatestotalsulfurvolatileacidity
0569110.00.0570.280.994256.421.03.266.07.90.3682.00.14
17289.50.0670.020.997006.44.03.465.01.80.6811.00.57
2649011.80.0360.290.989386.125.03.066.02.20.44100.00.34
32089.30.0690.310.996257.826.03.295.01.80.53120.00.57
4193910.20.0490.350.993406.649.03.437.01.50.85141.00.18
\n", - "
" - ], - "text/plain": [ - " wineid alcohol chlorides citricacid density fixedacidity freesulfur \\\n", - "0 5691 10.0 0.057 0.28 0.99425 6.4 21.0 \n", - "1 728 9.5 0.067 0.02 0.99700 6.4 4.0 \n", - "2 6490 11.8 0.036 0.29 0.98938 6.1 25.0 \n", - "3 208 9.3 0.069 0.31 0.99625 7.8 26.0 \n", - "4 1939 10.2 0.049 0.35 0.99340 6.6 49.0 \n", - "\n", - " ph quality sugar sulphates totalsulfur volatileacidity \n", - "0 3.26 6.0 7.9 0.36 82.0 0.14 \n", - "1 3.46 5.0 1.8 0.68 11.0 0.57 \n", - "2 3.06 6.0 2.2 0.44 100.0 0.34 \n", - "3 3.29 5.0 1.8 0.53 120.0 0.57 \n", - "4 3.43 7.0 1.5 0.85 141.0 0.18 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "showDF(wineDF)" ] @@ -461,150 +281,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wineidalcoholchloridescitricaciddensityfixedacidityfreesulfurphqualitysugarsulphatestotalsulfurvolatileacidity
030889.50.0440.740.997206.568.03.186.013.30.54224.00.260
1639510.70.0350.290.991426.444.03.177.01.10.55140.00.105
23819.40.0800.420.997408.311.03.216.02.00.8027.00.260
345299.10.0460.290.998346.639.03.056.014.40.50118.00.220
4431910.90.0290.310.992767.553.03.036.06.50.38160.00.180
\n", - "
" - ], - "text/plain": [ - " wineid alcohol chlorides citricacid density fixedacidity freesulfur \\\n", - "0 3088 9.5 0.044 0.74 0.99720 6.5 68.0 \n", - "1 6395 10.7 0.035 0.29 0.99142 6.4 44.0 \n", - "2 381 9.4 0.080 0.42 0.99740 8.3 11.0 \n", - "3 4529 9.1 0.046 0.29 0.99834 6.6 39.0 \n", - "4 4319 10.9 0.029 0.31 0.99276 7.5 53.0 \n", - "\n", - " ph quality sugar sulphates totalsulfur volatileacidity \n", - "0 3.18 6.0 13.3 0.54 224.0 0.260 \n", - "1 3.17 7.0 1.1 0.55 140.0 0.105 \n", - "2 3.21 6.0 2.0 0.80 27.0 0.260 \n", - "3 3.05 6.0 14.4 0.50 118.0 0.220 \n", - "4 3.03 6.0 6.5 0.38 160.0 0.180 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "wine6DF = wineDF.filter(\"quality > 5\")\n", "showDF(wine6DF)" @@ -619,176 +298,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wineidalcoholchloridescitricaciddensityfixedacidityfreesulfurphqualitysugarsulphatestotalsulfurvolatileacidityfeatureslabel
0431710.80.0460.290.995186.859.03.206.010.40.40143.00.16[10.800000190734863, 0.04600000008940697, 0.28...0.0
1337210.90.0590.260.995507.832.03.046.09.50.43178.00.40[10.899999618530273, 0.05900000035762787, 0.25...0.0
248309.40.0560.570.995486.760.02.966.06.60.43150.00.13[9.399999618530273, 0.0560000017285347, 0.5699...0.0
327319.70.0470.340.994406.924.03.206.04.00.52128.00.23[9.699999809265137, 0.04699999839067459, 0.340...0.0
47699.70.0820.020.997447.124.03.556.02.30.5394.00.59[9.699999809265137, 0.0820000022649765, 0.0199...0.0
\n", - "
" - ], - "text/plain": [ - " wineid alcohol chlorides citricacid density fixedacidity freesulfur \\\n", - "0 4317 10.8 0.046 0.29 0.99518 6.8 59.0 \n", - "1 3372 10.9 0.059 0.26 0.99550 7.8 32.0 \n", - "2 4830 9.4 0.056 0.57 0.99548 6.7 60.0 \n", - "3 2731 9.7 0.047 0.34 0.99440 6.9 24.0 \n", - "4 769 9.7 0.082 0.02 0.99744 7.1 24.0 \n", - "\n", - " ph quality sugar sulphates totalsulfur volatileacidity \\\n", - "0 3.20 6.0 10.4 0.40 143.0 0.16 \n", - "1 3.04 6.0 9.5 0.43 178.0 0.40 \n", - "2 2.96 6.0 6.6 0.43 150.0 0.13 \n", - "3 3.20 6.0 4.0 0.52 128.0 0.23 \n", - "4 3.55 6.0 2.3 0.53 94.0 0.59 \n", - "\n", - " features label \n", - "0 [10.800000190734863, 0.04600000008940697, 0.28... 0.0 \n", - "1 [10.899999618530273, 0.05900000035762787, 0.25... 0.0 \n", - "2 [9.399999618530273, 0.0560000017285347, 0.5699... 0.0 \n", - "3 [9.699999809265137, 0.04699999839067459, 0.340... 0.0 \n", - "4 [9.699999809265137, 0.0820000022649765, 0.0199... 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "4113\n" - ] - } - ], + "outputs": [], "source": [ "assembler = VectorAssembler(\n", " inputCols=['alcohol', 'chlorides', 'citricacid', 'density', 'fixedacidity', 'ph', 'freesulfur', 'sugar', 'sulphates', 'totalsulfur', 'volatileacidity'],\n", @@ -812,20 +324,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train Dataframe Row Count: \n", - "3359\n", - "Test Datafram Row Count: \n", - "752\n" - ] - } - ], + "outputs": [], "source": [ "# Split the data into train and test\n", "splits = trainingData1.randomSplit([0.8, 0.2], 1234)\n", @@ -840,225 +341,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wed May 15 18:10:26 PDT 2019\r\n" - ] - } - ], - "source": [ - "!date" 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "752\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
wineidalcoholchloridescitricaciddensityfixedacidityfreesulfurphqualitysugarsulphatestotalsulfurvolatileacidityfeatureslabelrawPredictionprobabilityprediction
099.50.0730.020.99687.89.03.367.02.00.5718.00.58[9.5, 0.0729999989271164, 0.019999999552965164...1.0[9.396850311426318, 0.5508459750593168, 0.0523...[0.9396850311426318, 0.05508459750593168, 0.00...0.0
11710.50.0920.560.99698.535.03.307.01.80.75103.00.28[10.5, 0.09200000017881393, 0.5600000023841858...1.0[7.872448026522618, 1.81795063588903, 0.294216...[0.7872448026522617, 0.18179506358890296, 0.02...0.0
2219.40.0770.480.99688.929.03.396.01.80.5360.00.22[9.399999618530273, 0.07699999958276749, 0.479...0.0[9.241572488316299, 0.670204601565711, 0.07725...[0.9241572488316301, 0.06702046015657112, 0.00...0.0
31099.60.0910.530.99768.018.03.376.02.50.8080.00.33[9.600000381469727, 0.09099999815225601, 0.529...0.0[8.585164908248752, 1.162295200618395, 0.24652...[0.858516490824875, 0.1162295200618395, 0.0246...0.0
427010.10.1040.510.999611.54.03.286.04.00.9723.00.18[10.100000381469727, 0.10400000214576721, 0.50...0.0[7.155537932285342, 2.686756186951431, 0.15402...[0.7155537932285343, 0.26867561869514317, 0.01...0.0
\n", - "
" - ], - "text/plain": [ - " wineid alcohol chlorides citricacid density fixedacidity freesulfur \\\n", - "0 9 9.5 0.073 0.02 0.9968 7.8 9.0 \n", - "1 17 10.5 0.092 0.56 0.9969 8.5 35.0 \n", - "2 21 9.4 0.077 0.48 0.9968 8.9 29.0 \n", - "3 109 9.6 0.091 0.53 0.9976 8.0 18.0 \n", - "4 270 10.1 0.104 0.51 0.9996 11.5 4.0 \n", - "\n", - " ph quality sugar sulphates totalsulfur volatileacidity \\\n", - "0 3.36 7.0 2.0 0.57 18.0 0.58 \n", - "1 3.30 7.0 1.8 0.75 103.0 0.28 \n", - "2 3.39 6.0 1.8 0.53 60.0 0.22 \n", - "3 3.37 6.0 2.5 0.80 80.0 0.33 \n", - "4 3.28 6.0 4.0 0.97 23.0 0.18 \n", - "\n", - " features label \\\n", - "0 [9.5, 0.0729999989271164, 0.019999999552965164... 1.0 \n", - "1 [10.5, 0.09200000017881393, 0.5600000023841858... 1.0 \n", - "2 [9.399999618530273, 0.07699999958276749, 0.479... 0.0 \n", - "3 [9.600000381469727, 0.09099999815225601, 0.529... 0.0 \n", - "4 [10.100000381469727, 0.10400000214576721, 0.50... 0.0 \n", - "\n", - " rawPrediction \\\n", - "0 [9.396850311426318, 0.5508459750593168, 0.0523... \n", - "1 [7.872448026522618, 1.81795063588903, 0.294216... \n", - "2 [9.241572488316299, 0.670204601565711, 0.07725... \n", - "3 [8.585164908248752, 1.162295200618395, 0.24652... \n", - "4 [7.155537932285342, 2.686756186951431, 0.15402... \n", - "\n", - " probability prediction \n", - "0 [0.9396850311426318, 0.05508459750593168, 0.00... 0.0 \n", - "1 [0.7872448026522617, 0.18179506358890296, 0.02... 0.0 \n", - "2 [0.9241572488316301, 0.06702046015657112, 0.00... 0.0 \n", - "3 [0.858516490824875, 0.1162295200618395, 0.0246... 0.0 \n", - "4 [0.7155537932285343, 0.26867561869514317, 0.01... 
0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\", numTrees=10)\n", "\n", @@ -1072,113 +357,9 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wed May 15 18:10:30 PDT 2019\r\n" - ] - } - ], - "source": [ - "!date" - ] - }, - { - "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
qualitylabelpredictionprobability
07.01.00.0[0.9396850311426318, 0.05508459750593168, 0.00...
17.01.00.0[0.7872448026522617, 0.18179506358890296, 0.02...
26.00.00.0[0.9241572488316301, 0.06702046015657112, 0.00...
36.00.00.0[0.858516490824875, 0.1162295200618395, 0.0246...
46.00.00.0[0.7155537932285343, 0.26867561869514317, 0.01...
\n", - "
" - ], - "text/plain": [ - " quality label prediction \\\n", - "0 7.0 1.0 0.0 \n", - "1 7.0 1.0 0.0 \n", - "2 6.0 0.0 0.0 \n", - "3 6.0 0.0 0.0 \n", - "4 6.0 0.0 0.0 \n", - "\n", - " probability \n", - "0 [0.9396850311426318, 0.05508459750593168, 0.00... \n", - "1 [0.7872448026522617, 0.18179506358890296, 0.02... \n", - "2 [0.9241572488316301, 0.06702046015657112, 0.00... \n", - "3 [0.858516490824875, 0.1162295200618395, 0.0246... \n", - "4 [0.7155537932285343, 0.26867561869514317, 0.01... " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "showDF(predictions.select(\"quality\", \"label\", \"prediction\", \"probability\"))" ] @@ -1192,17 +373,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test set accuracy = 0.7286096256684492\n" - ] - } - ], + "outputs": [], "source": [ "# compute accuracy on the test set\n", "evaluator = MulticlassClassificationEvaluator(labelCol=\"label\", predictionCol=\"prediction\",\n", @@ -1213,30 +386,14 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], "source": [ "session.execute(\"\"\"drop table wines\"\"\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1255,7 +412,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/jupyter/kmeans.ipynb b/jupyter/kmeans.ipynb index 15e0682..1df1eb7 100755 --- a/jupyter/kmeans.ipynb +++ b/jupyter/kmeans.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 99, + 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -81,7 +81,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# DataStax Enterprise Analytics\n", "" ] }, @@ -96,12 +95,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Connect to DSE Analytics Cluster" + "### Connect to Cassandra" ] }, { "cell_type": "code", - "execution_count": 102, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -120,20 +119,9 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "session.execute(\"\"\"\n", " CREATE KEYSPACE IF NOT EXISTS accelerate \n", @@ -151,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -167,20 +155,9 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "query = \"CREATE TABLE IF NOT EXISTS socialMedia \\\n", " (status_id int, social_type text, num_reactions int,\\\n", @@ -231,14 +208,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Load Car dataset from CSV file (socialMedia.csv)\n", + "### Load dataset from CSV file (socialMedia.csv)\n", "\n", - "#### Insert all the Car Data into the Apache Cassandra table `socialmedia`" + "#### Insert all the Data into the Apache Cassandra table 
`socialmedia`" ] }, { "cell_type": "code", - "execution_count": 107, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -264,7 +241,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Machine Learning with DSE Analytics and Apache Spark\n", + "## Machine Learning with Apache Spark\n", "" ] }, @@ -277,18 +254,9 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Table Row Count: \n", - "6621\n" - ] - } - ], + "outputs": [], "source": [ "spark = SparkSession.builder.appName('demo').master(\"local\").getOrCreate()\n", "\n", @@ -301,138 +269,9 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
status_idnum_angrysnum_commentsnum_hahasnum_likesnum_lovesnum_reactionsnum_sadsnum_sharesnum_wowssocial_type
0569101013013010photo
17280101361138001photo
264900352982171724741631video
320806042143000photo
4193905024024000photo
\n", - "
" - ], - "text/plain": [ - " status_id num_angrys num_comments num_hahas num_likes num_loves \\\n", - "0 5691 0 1 0 13 0 \n", - "1 728 0 1 0 136 1 \n", - "2 6490 0 3529 8 217 17 \n", - "3 208 0 6 0 42 1 \n", - "4 1939 0 5 0 24 0 \n", - "\n", - " num_reactions num_sads num_shares num_wows social_type \n", - "0 13 0 1 0 photo \n", - "1 138 0 0 1 photo \n", - "2 247 4 163 1 video \n", - "3 43 0 0 0 photo \n", - "4 24 0 0 0 photo " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "showDF(socialDF)" ] @@ -446,144 +285,9 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
status_idnum_angrysnum_commentsnum_hahasnum_likesnum_lovesnum_reactionsnum_sadsnum_sharesnum_wowssocial_typelabel
0569101013013010photo0.0
17280101361138001photo0.0
264900352982171724741631video1.0
320806042143000photo0.0
4193905024024000photo0.0
\n", - "
" - ], - "text/plain": [ - " status_id num_angrys num_comments num_hahas num_likes num_loves \\\n", - "0 5691 0 1 0 13 0 \n", - "1 728 0 1 0 136 1 \n", - "2 6490 0 3529 8 217 17 \n", - "3 208 0 6 0 42 1 \n", - "4 1939 0 5 0 24 0 \n", - "\n", - " num_reactions num_sads num_shares num_wows social_type label \n", - "0 13 0 1 0 photo 0.0 \n", - "1 138 0 0 1 photo 0.0 \n", - "2 247 4 163 1 video 1.0 \n", - "3 43 0 0 0 photo 0.0 \n", - "4 24 0 0 0 photo 0.0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "labelIndexer = StringIndexer(inputCol=\"social_type\", outputCol=\"label\", handleInvalid='keep')\n", "training = labelIndexer.fit(socialDF).transform(socialDF)\n", @@ -593,100 +297,18 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
social_typelabel
0video1.0
1photo0.0
2photo0.0
3video1.0
4photo0.0
\n", - "
" - ], - "text/plain": [ - " social_type label\n", - "0 video 1.0\n", - "1 photo 0.0\n", - "2 photo 0.0\n", - "3 video 1.0\n", - "4 photo 0.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "showDF(training.select(\"social_type\",\"label\"))" ] }, { "cell_type": "code", - "execution_count": 113, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----------+-----+\n", - "|social_type|count|\n", - "+-----------+-----+\n", - "| video| 2333|\n", - "| photo| 4288|\n", - "+-----------+-----+\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "training.groupBy('social_type').count().show()" ] @@ -706,33 +328,12 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 88, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "carPanda = training.toPandas()\n", - "carPanda.plot.scatter(x = 'num_likes', y = 'num_comments', c= 'label', figsize=(12,8), colormap='viridis')" + "smPanda = training.toPandas()\n", + "smPanda.plot.scatter(x = 'num_likes', y = 'num_comments', c= 'label', figsize=(12,8), colormap='viridis')" ] }, { @@ -757,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -779,163 +380,9 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
status_idnum_angrysnum_commentsnum_hahasnum_likesnum_lovesnum_reactionsnum_sadsnum_sharesnum_wowssocial_typelabelfeaturesprediction
0569101013013010photo0.0[13.0, 1.0]0
17280101361138001photo0.0[136.0, 1.0]0
264900352982171724741631video1.0[217.0, 3529.0]1
320806042143000photo0.0[42.0, 6.0]0
4193905024024000photo0.0[24.0, 5.0]0
\n", - "
" - ], - "text/plain": [ - " status_id num_angrys num_comments num_hahas num_likes num_loves \\\n", - "0 5691 0 1 0 13 0 \n", - "1 728 0 1 0 136 1 \n", - "2 6490 0 3529 8 217 17 \n", - "3 208 0 6 0 42 1 \n", - "4 1939 0 5 0 24 0 \n", - "\n", - " num_reactions num_sads num_shares num_wows social_type label \\\n", - "0 13 0 1 0 photo 0.0 \n", - "1 138 0 0 1 photo 0.0 \n", - "2 247 4 163 1 video 1.0 \n", - "3 43 0 0 0 photo 0.0 \n", - "4 24 0 0 0 photo 0.0 \n", - "\n", - " features prediction \n", - "0 [13.0, 1.0] 0 \n", - "1 [136.0, 1.0] 0 \n", - "2 [217.0, 3529.0] 1 \n", - "3 [42.0, 6.0] 0 \n", - "4 [24.0, 5.0] 0 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "kmeans = KMeans().setK(2).setSeed(1)\n", "model = kmeans.fit(trainingData)\n", @@ -957,30 +404,9 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------+-----+\n", - "|prediction|count|\n", - "+----------+-----+\n", - "| 1| 137|\n", - "| 0| 6484|\n", - "+----------+-----+\n", - "\n", - "+-----------+-----+\n", - "|social_type|count|\n", - "+-----------+-----+\n", - "| video| 2333|\n", - "| photo| 4288|\n", - "+-----------+-----+\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "predictions.groupBy('prediction').count().show()\n", "training.groupBy('social_type').count().show()" @@ -997,30 +423,9 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "car_df = predictions.toPandas()\n", "\n", @@ -1047,33 +452,6 @@ "source": [ "# Remember Data Science and analytics is an iterative process! It's a science! Hypothesis, test, analysis, and loop again! " ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.execute(\"\"\"drop table socialmedia\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -1092,7 +470,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.7.3" } }, "nbformat": 4,