forked from fedorst/dataeng-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
analysis.sql
125 lines (113 loc) · 4.5 KB
/
analysis.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/* 1. What is the most common source that meme entries originate from? */
SELECT ORIGIN,
SUM(COUNT) AS COUNT
FROM fact_table
LEFT JOIN origin_dimension ON fact_table.ORIGIN_ID = origin_dimension.ORIGIN_ID
GROUP BY ORIGIN
ORDER BY COUNT DESC LIMIT 1;
/* Answer: YouTube */
/* 2. What sub-category (type) of memes was the most popular in the period from 2016 to 2018? */
SELECT type_dimension.MEME_TYPE,
COUNT(*)
FROM fact_table
LEFT JOIN date_dimension ON fact_table.DATE_ID = date_dimension.DATE_ID
LEFT JOIN type_dimension ON fact_table.TYPE_ID = type_dimension.TYPE_ID
WHERE date_dimension.YEAR BETWEEN 2016 AND 2018
GROUP BY MEME_TYPE
ORDER BY COUNT DESC;
/* Answer: 'No type', followed by 'catchphrase' */
/* 3. What year produced the largest number of memes featuring any catchphrase? */
SELECT date_dimension.YEAR,
COUNT(*)
FROM fact_table
LEFT JOIN date_dimension ON fact_table.DATE_ID = date_dimension.DATE_ID
LEFT JOIN type_dimension ON fact_table.TYPE_ID = type_dimension.TYPE_ID
WHERE type_dimension.MEME_TYPE = 'catchphrase'
GROUP BY date_dimension.YEAR
ORDER BY COUNT DESC;
/* ANSWER: 2009 */
/* 4. What year has seen the highest number of submissions to the KYM database? */
SELECT COUNT(*),
LEFT(ADDED_TO_KYM, 4) AS YEAR
FROM additional_analysis_table
GROUP BY YEAR
ORDER BY COUNT DESC;
/* ANSWER: 2009 */
/* 5. What are the most common tags among different memes types? */
SELECT type_dimension.MEME_TYPE,
MODE() WITHIN GROUP (
ORDER BY tag_dimension.TAG
) AS MOST_FREQUENT_TAG
FROM fact_table
LEFT JOIN tag_dimension ON fact_table.TAG_ID = tag_dimension.TAG_ID
LEFT JOIN type_dimension ON fact_table.TYPE_ID = type_dimension.TYPE_ID
GROUP BY MEME_TYPE;
/* ANSWER: See output */
/* 6. How many tags on average are assigned to meme entries? */
SELECT (AVG(LENGTH(tags) - LENGTH(REPLACE(tags, ',', '')))) AS avg_tags
FROM additional_analysis_table;
/* ANSWER: 7.3976293103448276 */
/* 7. What meme had the most long-lasting search interest? */
SELECT TITLE,
(SPREAD_YEAR_MAX - SPREAD_YEAR_MIN) AS SPREAD_DURATION
FROM additional_analysis_table
WHERE SPREAD_YEAR_MIN <> 'NaN'
ORDER BY SPREAD_DURATION DESC;
/* ANSWER: "Super S Stussy" */
/* 8. What are the top 5 external sources that meme entries get spread over most frequently? */
SELECT UNNEST(STRING_TO_ARRAY(SPREAD_REFERENCES, ',', '')) AS SPREAD_REF,
COUNT(*) AS COUNT
FROM additional_analysis_table
WHERE SPREAD_REFERENCES IS NOT NULL
GROUP BY SPREAD_REF
ORDER BY COUNT DESC
LIMIT 5;
/* ANSWER: Reddit, Tumblr, Facebook, Urban Dictionary, Twitter */
/* 9. What objects are commonly present in meme entries having high unsafe search likelihood? */
/* 10. What tags are associated with a decrease in the meme image's safety property in various categories (e.g. adult, spoof, medical, violence, racy)? */
SELECT TITLE,
(SAFESEARCH_ADULT + SAFESEARCH_MEDICAL + SAFESEARCH_RACY + SAFESEARCH_SPOOF + SAFESEARCH_VIOLENCE) / 5 AS UNSAFETY,
MEME_TYPE,
TAGS
FROM additional_analysis_table
WHERE SAFESEARCH_ADULT <> 'NaN'
ORDER BY UNSAFETY DESC
/* ANSWER: Among the tags of the first 10 memes, we notice 'kittiestitties4ever', 'rape', 'boobs, boobies, breasts', 'anal sex, horse sex' and 'violent, violence' */
/* 11. Show the number of the memes per year in ascending order. */
SELECT COUNT(*), date_dimension.YEAR
FROM fact_table
LEFT JOIN date_dimension ON fact_table.DATE_ID = date_dimension.DATE_ID
WHERE date_dimension.YEAR != 0
GROUP BY date_dimension.YEAR
ORDER BY count ASC;
/* ANSWER: see result. */
/* 12. What is the percentage of memes with male content vs female content (tags)? */
SELECT
(SELECT 100 * COUNT(*)::decimal /
(SELECT COUNT(*)::decimal
FROM additional_analysis_table
WHERE NOT TAGS ISNULL) AS FEMALE_PCT
FROM additional_analysis_table
WHERE (TAGS SIMILAR TO '(%woman%)|(%female%)|(%girl%)'))
AS FEMALE_PCT,
(SELECT 100 * COUNT(*)::decimal /
(SELECT COUNT(*)::decimal
FROM additional_analysis_table
WHERE NOT TAGS ISNULL) AS MALE_PCT
FROM additional_analysis_table
WHERE (TAGS SIMILAR TO '(%man%)|(%male%)|(%boy%)'
AND NOT TAGS SIMILAR TO '(%woman%)|(%female%)|(%girl%)'))
AS MALE_PCT;
/* ANSWER: 2.4% feature something female and 10.5% feature something male*/
/* 2.2. Find the most frequent DBpedia schema types. */
SELECT DBSCHEMA,
COUNT
FROM
(SELECT UNNEST(STRING_TO_ARRAY(TYPES_SCHEMA, ',', '')) AS DBSCHEMA,
COUNT(*) AS COUNT
FROM additional_analysis_table
WHERE TYPES_SCHEMA IS NOT NULL
GROUP BY DBSCHEMA) AS SUBQUERY1
WHERE DBSCHEMA IS NOT NULL
ORDER BY COUNT DESC;
/* ANSWER: Most frequent are CreativeWork, followed by WebSite and Person*/