diff --git a/cassandra/dev/drivers/php/beginner/build.gradle b/cassandra/dev/drivers/php/beginner/build.gradle new file mode 100755 index 00000000..56ed9608 --- /dev/null +++ b/cassandra/dev/drivers/php/beginner/build.gradle @@ -0,0 +1,17 @@ +plugins { + id 'com.datastax.gradle.curriculum.plugin' version '0.1.5' +} + +/* +buildscript { + repositories { + mavenLocal() + jcenter() + } + dependencies { + classpath 'com.datastax:gradle-curriculum-plugin:0.1.5-SNAPSHOT' + } +} +*/ + +apply plugin: 'com.datastax.gradle.curriculum.plugin' diff --git a/cassandra/dev/drivers/php/beginner/src/exercises.adoc b/cassandra/dev/drivers/php/beginner/src/exercises.adoc new file mode 100755 index 00000000..50677973 --- /dev/null +++ b/cassandra/dev/drivers/php/beginner/src/exercises.adoc @@ -0,0 +1,272 @@ + +== Data Modeling Use Case + +=== *Exercise 1: Reviewing the investment portfolio management use case* + +==== *In this exercise, you will:* + +* Review the conceptual, logical, and physical designs +* Instantiate and query the database + +==== *_Steps_* + +==== *Review the conceptual, logical, and physical designs* + +. Review the data modeling steps. + +image::{image_path}/usecaseimage1.jpg[] + +image::{image_path}/investmentreview.svg[] + + +==== *Instantiate and query the database* + +. In cqlsh, create and populate the tables shown above by executing the CQL script. + +[source,sql] + SOURCE '~/casdat/exercise7/portfolio.cql'; + +. In cqlsh, execute the USE command to set the portfolio keyspace as the current default. + +[source,sql] + USE portfolio; + +. In cqlsh (or DevCenter), express the following queries in CQL over the portfolio database for a user with username ‘green’. + +Q1: Find all investment account information for a specified username. + +[source,sql] + SELECT * FROM accounts_by_user WHERE username = 'green'; + +Q2: Find all positions (stocks, quantities, market values) for a specified account number; order by stock symbol (ASC). 
+ +[source,sql] + SELECT * FROM stocks_by_account WHERE account_number = 111111111; + +Q3: Find all trades for a specified account number and, optionally, a known date range, trade type (buy/sell), and stock symbol; order by trade date (DESC). + +Q3.1: Find all trades for a specified account number; order by trade date (DESC). + +[source,sql] +.... +SELECT * FROM trades_by_account12 WHERE account_number = 111111111; + +SELECT * FROM trades_by_account WHERE account_number = 111111111; +.... + +Q3.2: Find all trades for a specified account number and date range; order by trade date (DESC). + +[source,sql] +.... +SELECT * FROM trades_by_account12 WHERE account_number = 111111111 AND date > '2014-05-01'; + +SELECT * FROM trades_by_account WHERE account_number = 111111111 AND date > '2014-05-01'; +.... + +Q3.3: Find all trades for a specified account number, date range, and trade type; order by trade date (DESC). + +[source,sql] +.... +SELECT * FROM trades_by_account3 WHERE account_number = 111111111 AND date > '2014-01-01' AND type = 'Buy'; + +SELECT * FROM trades_by_account WHERE account_number = 111111111 AND date > '2014-01-01' AND type = 'Buy'; +.... + +Q3.4: Find all trades for a specified account number, date range, trade type, and stock symbol; order by trade date (DESC). + +[source,sql] +.... +SELECT * FROM trades_by_account4 WHERE account_number = 111111111 AND date > '2014-01-01' AND type = 'Buy' AND symbol = 'EBAY'; + +SELECT * FROM trades_by_account WHERE account_number = 111111111 AND date > '2014-01-01' AND type = 'Buy' AND symbol = 'EBAY' ALLOW FILTERING; +.... + +Q3.5: Find all trades for a specified account number, date range, and stock symbol; order by trade date (DESC). + +[source,sql] +.... +SELECT * FROM trades_by_account5 WHERE account_number = 111111111 AND date > '2014-01-01' AND symbol = 'EBAY'; + +SELECT * FROM trades_by_account WHERE account_number = 111111111 AND date > '2014-01-01' AND symbol = 'EBAY'; +.... 
+ +Q4: Find all information about owners of investment accounts with a position in a specified stock. + +Q4.1: Find all bucket numbers for a specified stock. + +[source,sql] + SELECT * FROM buckets_by_stock WHERE symbol = 'NFLX'; + +Q4.2: Retrieve all user information in a known bucket for a specified stock. + +[source,sql] + SELECT * FROM users_by_stock WHERE symbol = 'NFLX' AND bucket IN (1,2); + +Q5: Find how many accounts have shares of a specified stock. + +[source,sql] + SELECT * FROM accounts_by_stock WHERE symbol = 'NFLX'; + +=== *Appendix* + +==== *Create Keyspace and Tables* + +[source,sql] +.... +CREATE KEYSPACE portfolio +WITH replication = { + 'class': 'SimpleStrategy', + 'replication_factor' : 1 +}; + +USE portfolio; +.... + +[source,sql] +.... +CREATE TABLE accounts_by_user ( + username TEXT, + name TEXT, + ssn INT, + dob TIMESTAMP, + emails MAP, + phones MAP, + account_number INT, + cash_balance DECIMAL, + investment_value DECIMAL, + total_value DECIMAL, + PRIMARY KEY (username) + ); +.... + +[source,sql] +.... +CREATE TABLE stocks_by_account ( + account_number INT, + symbol TEXT, + description TEXT, + quote DECIMAL, + quantity DECIMAL, + market_value DECIMAL, + PRIMARY KEY (account_number, symbol) + ); +.... + +[source,sql] +.... +CREATE TABLE trades_by_account12 ( + account_number INT, + date TIMESTAMP, + type TEXT, + symbol TEXT, + trade_id UUID, + quantity DECIMAL, + price DECIMAL, + amount DECIMAL, + PRIMARY KEY (account_number, date, trade_id) + ) WITH CLUSTERING ORDER BY (date DESC); +.... + +[source,sql] +.... +CREATE TABLE trades_by_account3 ( + account_number INT, + date TIMESTAMP, + type TEXT, + symbol TEXT, + trade_id UUID, + quantity DECIMAL, + price DECIMAL, + amount DECIMAL, + PRIMARY KEY (account_number, type, date, trade_id) + ) WITH CLUSTERING ORDER BY (type ASC, date DESC); +.... + +[source,sql] +.... 
+CREATE TABLE trades_by_account4 ( + account_number INT, + date TIMESTAMP, + type TEXT, + symbol TEXT, + trade_id UUID, + quantity DECIMAL, + price DECIMAL, + amount DECIMAL, + PRIMARY KEY (account_number, symbol, type, date, trade_id) + ) WITH CLUSTERING ORDER BY (symbol ASC, type ASC, date DESC); +.... + +[source,sql] +.... +CREATE TABLE trades_by_account5 ( + account_number INT, + date TIMESTAMP, + type TEXT, + symbol TEXT, + trade_id UUID, + quantity DECIMAL, + price DECIMAL, + amount DECIMAL, + PRIMARY KEY (account_number, symbol, date, trade_id) + ) WITH CLUSTERING ORDER BY (symbol ASC, date DESC); +.... + +_We kept trades_by_account for demonstration purposes_ + +[source,sql] +.... +CREATE TABLE trades_by_account ( + account_number INT, + date TIMESTAMP, + type TEXT, + symbol TEXT, + trade_id UUID, + quantity DECIMAL, + price DECIMAL, + amount DECIMAL, + PRIMARY KEY (account_number, date, trade_id) + ) WITH CLUSTERING ORDER BY (date DESC); + + CREATE INDEX on trades_by_account (type); + CREATE INDEX on trades_by_account (symbol); +.... + +[source,sql] +.... +CREATE TABLE buckets_by_stock ( + symbol TEXT, + bucket INT, + rows COUNTER, + PRIMARY KEY (symbol, bucket) + ); +.... + +[source,sql] +.... +CREATE TABLE users_by_stock ( + symbol TEXT, + bucket INT, + username TEXT, + name TEXT, + ssn INT, + dob TIMESTAMP, + emails MAP, + phones MAP, + account_number INT, + PRIMARY KEY ((symbol, bucket), username) + ); +.... + +[source,sql] +.... +CREATE TABLE accounts_by_stock ( + symbol TEXT, + accounts COUNTER, + PRIMARY KEY (symbol) + ); +.... 
+ + + +*END OF EXERCISE* diff --git a/cassandra/dev/drivers/php/beginner/src/includes.adoc b/cassandra/dev/drivers/php/beginner/src/includes.adoc new file mode 100755 index 00000000..9f46c6c9 --- /dev/null +++ b/cassandra/dev/drivers/php/beginner/src/includes.adoc @@ -0,0 +1 @@ +include::{slide_path}/php_beginner.adoc[] diff --git a/cassandra/dev/drivers/php/beginner/src/notes.adoc b/cassandra/dev/drivers/php/beginner/src/notes.adoc new file mode 100755 index 00000000..73a91e12 --- /dev/null +++ b/cassandra/dev/drivers/php/beginner/src/notes.adoc @@ -0,0 +1,99 @@ +//// +In order to hide the instructor comments and make this file student-notes instead, +add a ! to the end of instructor, like: +:instructor!: +//// +//// +This attribute is used to show/hide the instructor-only notes in this file. +//// +:instructor!: + +== *DS220 Apache Cassandra Data Modeling* + +=== *Data Modeling Use Case* + +==== *Slide 1: What are sensor applications?* +Sensors are everywhere, and Cassandra is a wonderful database to capture this kind +of data. Cassandra is designed to handle time-series data, and when you think about it, +it is clear that sensor applications exist to understand the _change_ in values that +sensors are measuring. + +ifdef::instructor[] +[NOTE] +*Instructor:* This might be a good opportunity to open the floor to discussion about Cassandra and sensor applications. +endif::instructor[] + +==== *Slide 2: Sensor data: use case introduction* +Generally, sensors are deployed in networks, but note that each sensor is uniquely +identified and arrayed. Although the sensors may be completely identical, the geographical +location or person to whom sensors are attached must be trackable in order to aggregate the +data that will be collected. + +ifdef::instructor[] +[NOTE] +*Instructor:* This is a quick round-up and review of everything taught over +the course of 2 days. 
Do not get bogged down, but summarize what was +taught and bridge each slide together with their relationship. +endif::instructor[] + +==== *Slide 3: More Sensor data: use case introduction* +When modeling the data and how you will store and retrieve it in Cassandra, it is +vital to understand that the design is driven by the queries that you will make. +The retrieval of data plays a crucial role in how to optimize the storage within +the Cassandra database. + +ifdef::instructor[] +[NOTE] +*Instructor:* It cannot be overemphasized that *query-driven* design is important +in Cassandra. This may seem like a departure for the relational database designers +in your audience. +endif::instructor[] + +==== *Slide 4: Sensor data: conceptual data model* +This conceptual data model shows the relationship of the data entities. + +ifdef::instructor[] +[NOTE] +*Instructor:* Remind participants that the conceptual data model is database-agnostic, +and uses Chen notation. +endif::instructor[] + +==== *Slide 5: Sensor data: application workflow* +This slide shows the relationship of the queries, and how you might expect a query +to access sensor data. + +==== *Slide 6: Sensor data: logical data model* +A logical data model, to review, puts together the conceptual model with data types +and defines the partition key and primary keys of the various Cassandra tables that +will be used by the application. + +ifdef::instructor[] +[NOTE] +*Instructor:* This is the meat of this example. Discuss WHY certain choices are made +for the primary key and clustering columns, in order to answer the queries that were +designed. +endif::instructor[] + +==== *Slides 7-9: Sensor data: analysis* +Understanding the partition size and number of partitions is the final key to ensuring +that the table design will work when scaled out on Cassandra. 
Duplication must be +considered as part of the design, to be sure that the application will not be +overwhelmed with updating tables when the data starts rolling in. + +ifdef::instructor[] +[NOTE] +*Instructor:* The estimates done here are simply to give a magnitude of the scale +that may result from the table design. Note that the analysis done here points to +storing weekly data to end up with manageable partitions. +endif::instructor[] + +==== *Slides 10-14: Sensor data: physical data model* +Each table must be carefully designed to answer a question that the user has. + +ifdef::instructor[] +[NOTE] +*Instructor:* Some questions to ask the students as you go through the tables are: +(1) what similarities do you see in the tables? +(2) what differences? +(3) why is clustering order important? +endif::instructor[] diff --git a/cassandra/dev/drivers/php/beginner/src/objectives.adoc b/cassandra/dev/drivers/php/beginner/src/objectives.adoc new file mode 100755 index 00000000..c953260f --- /dev/null +++ b/cassandra/dev/drivers/php/beginner/src/objectives.adoc @@ -0,0 +1,12 @@ +== Sensor Data Use Case + +=== Learning Objectives + +*The objectives for this leaf are:* + +* You will be able to identify examples of sensor applications. +* You will be able to create a conceptual data model for sensor data. +* You will be able to examine an application workflow for a sensor application for characteristics. +* You will be able to create a useful logical data model for sensor data. +* You will be able to analyze and validate sensor data using a previously created logical data model. +* You will be able to create a physical data model for sensor data. diff --git a/cassandra/dev/drivers/php/beginner/src/outline.adoc b/cassandra/dev/drivers/php/beginner/src/outline.adoc new file mode 100755 index 00000000..0fa9e50b --- /dev/null +++ b/cassandra/dev/drivers/php/beginner/src/outline.adoc @@ -0,0 +1,11 @@ +== Sensor Data Use Case + +=== Outline + +* What are sensor applications? 
+* Sensor data: use case introduction +* Sensor data: conceptual data model +* Sensor data: application workflow +* Sensor data: logical data model +* Sensor data: analysis +* Sensor data: physical data model diff --git a/cassandra/dev/drivers/php/beginner/src/slides.adoc b/cassandra/dev/drivers/php/beginner/src/slides.adoc new file mode 100755 index 00000000..5afdd2d2 --- /dev/null +++ b/cassandra/dev/drivers/php/beginner/src/slides.adoc @@ -0,0 +1,14 @@ += Getting Started with the PHP Driver +DataStax Training +:backend: deckjs +:deckjs_theme: datastax +:deckjs_transition: fade +:navigation: +:status: +:notes: +:split: + + +:slide_path: slides +:image_path: images +include::includes.adoc[] diff --git a/cassandra/dev/drivers/php/beginner/src/slides/php_beginner.adoc b/cassandra/dev/drivers/php/beginner/src/slides/php_beginner.adoc new file mode 100644 index 00000000..a20aff68 --- /dev/null +++ b/cassandra/dev/drivers/php/beginner/src/slides/php_beginner.adoc @@ -0,0 +1,181 @@ +== Let's Get Started! + +* Get Cassandra +* Download the latest version of Cassandra from datastax.com + +== Set up your schema + +* We will be focusing on the KillrVideo app, namely the users table. Let's first create our keyspace. + +[source, sql] +---- +//Create the keyspace +CREATE KEYSPACE killrvideo WITH REPLICATION = +{ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }; + +//Use the keyspace +USE killrvideo; +---- + +== Set up your schema + +* Let's create the "users" table: + +[source, sql] +---- +// Users keyed by id +CREATE TABLE users ( + userid uuid, + firstname text, + lastname text, + email text, + created_date timestamp, + PRIMARY KEY (userid) +); +---- + +== Download the driver + +* First install the 2.0 version of the C/C++ driver + + +* Install with pecl: + +[source, bash] +---- +pecl install cassandra +---- + + +* Check out the DataStax PHP Driver on GitHub + +== Setup + +* For this demo, we’re going to be creating a simple console application. 
+* Open a text editor and create a PHP file. +* First we need to connect to our cluster and the “killrvideo” keyspace. + +[source, php] +---- +<?php + +$cluster = Cassandra::cluster()->build(); +$keyspace = 'killrvideo'; +$session = $cluster->connect($keyspace); +---- + +== Do an INSERT + +* Now that you are connected to the “killrvideo” keyspace, let’s insert a user into +the “users” table + +[source, php] +---- +<?php + +$session->execute(new Cassandra\SimpleStatement( + "INSERT INTO users (userid, created_date, email, firstname, lastname) + VALUES (14c532ac-f5ae-479a-9d0a-36604732e01d, '2013-01-01 00:00:00', + 'luke@example.com','Luke','Tillman')" + )); +---- + +== SELECT + +* Using the DataStax PHP Cassandra driver, we can easily pull the user back out + +[source, php] +---- +<?php + +$result = $session->execute(new Cassandra\SimpleStatement + ("SELECT firstname, lastname, email FROM killrvideo.users + WHERE userid=14c532ac-f5ae-479a-9d0a-36604732e01d")); + +foreach ($result as $row) { + printf("user: \"%s\" \"%s\" email: \"%s\" \n", $row['firstname'], + $row['lastname'], $row['email']); + } +---- + +* The output: +[source, php] +---- +user: "Luke" "Tillman" email: "luke@example.com" +---- + +== Do an UPDATE + +* Supposing a user wanted to update their email address in the system + +[source, php] +---- +<?php + +$session->execute(new Cassandra\SimpleStatement + ("UPDATE users SET email = 'language_evangelist@example.com' + WHERE userid = 14c532ac-f5ae-479a-9d0a-36604732e01d")); +---- + +== Do an UPDATE + +[source, php] +---- +<?php + +$result = $session->execute(new Cassandra\SimpleStatement + ("SELECT firstname, lastname, email FROM killrvideo.users + WHERE userid=14c532ac-f5ae-479a-9d0a-36604732e01d")); + +foreach ($result as $row) { + printf("user: \"%s\" \"%s\" email: \"%s\" \n", $row['firstname'], + $row['lastname'], $row['email']); + } +---- + +* The output: + +[source, php] +---- +user: "Luke" "Tillman" email: "language_evangelist@example.com" +---- + +== DELETE a user + +* Now let’s delete our user from the table +* Then we can print out all the rows. 
+ +[source, php] +---- +<?php + +$session->execute(new Cassandra\SimpleStatement + ("DELETE FROM users WHERE userid = 14c532ac-f5ae-479a-9d0a-36604732e01d")); +---- + +== DELETE a user + +* You’ll notice that the user's information no longer comes back after being deleted +(others might, if you have inserted users previously). + +[source, php] +---- +<?php + +$result = $session->execute(new Cassandra\SimpleStatement + ("SELECT firstname, lastname, email FROM killrvideo.users + WHERE userid=14c532ac-f5ae-479a-9d0a-36604732e01d")); + +foreach ($result as $row) { + printf("user: \"%s\" \"%s\" email: \"%s\" \n", $row['firstname'], + $row['lastname'], $row['email']); + } + +?> +---- + +== It's just that easy! + +* CQL is very similar to SQL; in many cases the same syntax will work +* This makes querying for data very straightforward if you have a background with relational databases. diff --git a/courses/DS220/data-modeling/data-modeling-wins/challenge-3/src/exercises.adoc b/courses/DS220/data-modeling/data-modeling-wins/challenge-3/src/exercises.adoc index 96023a85..098f720f 100644 --- a/courses/DS220/data-modeling/data-modeling-wins/challenge-3/src/exercises.adoc +++ b/courses/DS220/data-modeling/data-modeling-wins/challenge-3/src/exercises.adoc @@ -31,7 +31,7 @@ image:{image_path}/ninja.jpg[width="40%"] [source,sql] -- -CREATE TABLE bad_videos_by_tag_year_table ( +CREATE TABLE bad_videos_by_tag_year ( tag text, added_year int, added_date timestamp, @@ -43,7 +43,7 @@ CREATE TABLE bad_videos_by_tag_year_table ( ); -- -3) As an aside, use DESCRIBE TABLE to view the structure of your `bad_videos_by_tag_year_table` table. +3) As an aside, use DESCRIBE TABLE to view the structure of your `bad_videos_by_tag_year` table. NOTE: Notice the column order differs from the CREATE TABLE statement. Cassandra orders columns by partition key, clustering columns (shown later), and then alphabetical order of the remaining columns. @@ -51,30 +51,28 @@ NOTE: Notice the column order differs from the CREATE TABLE statement. 
Cassandra [source,sql] -- -COPY bad_videos_by_tag_year_table (tag, added_year, video_id, added_date, description, title, user_id) FROM 'videos_by_tag_year.csv' WITH HEADER=true; +COPY bad_videos_by_tag_year (tag, added_year, video_id, added_date, description, title, user_id) FROM 'videos_by_tag_year.csv' WITH HEADER=true; -- NOTE: We must explicitly list the column names because this table schema no longer matches the CSV structure. -NOTE: The number of imported rows differs from our `videos` table because 'videos_by_tag_year.csv' duplicates a video record for each `tag` and `added_year` combo that the video has. We will discuss more on denormalization techniques later. - -Note the number of imported rows. Now COUNT() the number of rows in the `bad_videos_by_tag_year_table`. +Note the number of imported rows. Now COUNT() the number of rows in the `bad_videos_by_tag_year`. [source,sql] -- SELECT COUNT(*) -FROM bad_videos_by_tag_year_table; +FROM bad_videos_by_tag_year; -- -Notice the number of rows in the `bad_videos_by_tag_year_table` does not match the number of rows imported from `videos_by_tag_year.csv`. Since `videos_by_tag_year.csv` duplicates `video_id` for each unique `tag` and `year` per video, Cassandra upserted several records during the COPY. `video_id` is not a proper partition key for this scenario. +Notice the number of rows in the `bad_videos_by_tag_year` does not match the number of rows imported from `videos_by_tag_year.csv`. Since `videos_by_tag_year.csv` duplicates `video_id` for each unique `tag` and `year` per video, Cassandra upserted several records during the COPY. `video_id` is not a proper partition key for this scenario. 5) Drop your nasty table. [source,sql] -DROP TABLE bad_videos_by_tag_year_table; +DROP TABLE bad_videos_by_tag_year; |=== -| Your mission is to restructure your table and allow users to query on `tag` and possible `year` ranges while avoiding upserts on import. 
You must also return your results in descending order of year.| image:{image_path}/mission.jpg[width="30%",float="right"] +| Your mission is to restructure your table and allow users to query on `tag` and possible `added_year` ranges while avoiding upserts on import. You must also return your results in descending order of year.| image:{image_path}/mission.jpg[width="30%",float="right"] |=== === *_Steps_* diff --git a/courses/DS220/data-modeling/data-modeling-wins/challenge-3/src/solutions.adoc b/courses/DS220/data-modeling/data-modeling-wins/challenge-3/src/solutions.adoc index 87080d5d..e6c8d4fe 100644 --- a/courses/DS220/data-modeling/data-modeling-wins/challenge-3/src/solutions.adoc +++ b/courses/DS220/data-modeling/data-modeling-wins/challenge-3/src/solutions.adoc @@ -5,7 +5,7 @@ === *Steps* [source,sql] -CREATE TABLE bad_videos_by_tag_year_table ( +CREATE TABLE bad_videos_by_tag_year ( tag text, added_year int, added_date timestamp, @@ -17,14 +17,14 @@ CREATE TABLE bad_videos_by_tag_year_table ( ); [source,sql] -COPY bad_videos_by_tag_year_table (tag, added_year, video_id, added_date, description, title, user_id) FROM 'videos_by_tag_year.csv' WITH HEADER=true; +COPY bad_videos_by_tag_year (tag, added_year, video_id, added_date, description, title, user_id) FROM 'videos_by_tag_year.csv' WITH HEADER=true; [source,sql] SELECT COUNT(*) -FROM bad_videos_by_tag_year_table; +FROM bad_videos_by_tag_year; [source,sql] -DROP TABLE bad_videos_by_tag_year_table; +DROP TABLE bad_videos_by_tag_year; [source,sql] CREATE TABLE videos_by_tag_year (