%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Journal Article
% LaTeX Template
% Version 1.3 (9/9/13)
%
% This template has been downloaded from:
% http://www.LaTeXTemplates.com
%
% Original author:
% Frits Wenneker (http://www.howtotex.com)
%
% License:
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%----------------------------------------------------------------------------------------
% PACKAGES AND OTHER DOCUMENT CONFIGURATIONS
%----------------------------------------------------------------------------------------
\documentclass[twoside]{article}
\usepackage{mathtools}
\usepackage{amsmath}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
\usepackage{amsthm}
\newtheorem{claim}{Claim}
\usepackage{graphicx}
\graphicspath{ {figures/} }
\usepackage[usenames,dvipsnames]{color} % Required for specifying custom colors and referring to colors by name
\definecolor{MyRed}{rgb}{0.6, 0.0, 0.0}
\definecolor{MyGreen}{rgb}{0.0,0.4,0.0} % Comment color
\definecolor{MyBlue}{rgb}{0.0, 0.0, 0.6}
%\setlength\parindent{24pt}
\usepackage[pdftex]{hyperref} % For hyperlinks in the PDF
\hypersetup{
colorlinks=true,
linkcolor=MyBlue,
citecolor=MyRed,
urlcolor= MyGreen
}
% make hyperlinks bold
\newcommand{\aref}[1]
{\textbf{\autoref{#1}}}
\newcommand{\nref}[1]
{\textbf{\nameref{#1}}}
\newcommand{\cc}[1]
{\textbf{\cite{#1}}}
\usepackage{lipsum} % Package to generate dummy text throughout this template
\usepackage[sc]{mathpazo} % Use the Palatino font
\usepackage[T1]{fontenc} % Use 8-bit encoding that has 256 glyphs
\linespread{1.05} % Line spacing - Palatino needs more space between lines
\usepackage{microtype} % Slightly tweak font spacing for aesthetics
\usepackage[hmarginratio=1:1,top=32mm,columnsep=20pt]{geometry} % Document margins
\usepackage{multicol} % Used for the two-column layout of the document
\usepackage[hang, small,labelfont=bf,up,textfont=it,up]{caption} % Custom captions under/above floats in tables or figures
\usepackage{booktabs} % Horizontal rules in tables
\usepackage{float} % Required for tables and figures in the multi-column environment - they need to be placed in specific locations with the [H] (e.g. \begin{table}[H])
\usepackage{lettrine} % The lettrine is the first enlarged letter at the beginning of the text
\usepackage{paralist} % Used for the compactitem environment which makes bullet points with less space between them
\usepackage{titlesec} % Allows customization of titles
%\renewcommand\thesection{\Roman{section}} % Roman numerals for the sections
%\renewcommand\thesubsection{\Roman{subsection}} % Roman numerals for subsections
\titleformat{\section}[block]{\large\scshape\centering}{\thesection.}{1em}{} % Change the look of the section titles
\titleformat{\subsection}[block]{\large}{\thesubsection.}{1em}{} % Change the look of the section titles
%----------------------------------------------------------------------------------------
% TITLE SECTION
%----------------------------------------------------------------------------------------
\title{\vspace{-15mm}\fontsize{24pt}{10pt}\selectfont\textbf{Estimating Trending Topics on Twitter with Small Subsets of the Total Data}} % Article title
\author{
\large
\textsc{Evan Miller}\\[2mm]
\textsc{Kiran Vodrahalli}\\[2mm]
\textsc{Albert Lee}\\[2mm]
\vspace{-5mm}
}
\date{January 13, 2015}
%----------------------------------------------------------------------------------------
\begin{document}
\maketitle % Insert title
%----------------------------------------------------------------------------------------
% ARTICLE CONTENTS
%----------------------------------------------------------------------------------------
\small{
\section{Introduction} \label{sec:Intro}
% introduction
Since its founding in $2006$, Twitter has grown into a corporation dominating a significant
proportion of social media today. In recent years, researchers have analyzed tweets
to detect phenomena ranging from the viral spread of news to earthquakes \cc{Burks:2014}. The volume of existing tweets is prohibitively large for standard methods of analysis and requires approaches that either keep large amounts of data in memory for fast access (including parallel approaches to data processing) or use only sublinear memory while retaining accuracy guarantees. In this paper, we investigate a problem using the latter approach.
\subsection{Problem Statement}
% give clear definition of problem and motivation here
We would like to create an online algorithm that provides a real-time estimate of the frequencies of hashtags in Twitter time-series data.
Since we want to run this algorithm in real time, we would like it to be space-efficient: it should not need to store a large number of hashtag counts at any given time. Ideally, the algorithm's space complexity should be sublinear in the number of hashtags seen.
Therefore, our estimate should not require knowledge of the exact frequency history of hashtags in tweets.
More specifically, we will approximate the $k$ most popular Twitter hashtags in time intervals of length $y$ in order to tell what is trending during each time period.
The idea is that estimating the top $k$ hashtags in a time interval provides a model for the topics that are trending on Twitter in that period.
\subsection{Previous Work} \label{sec:PreviousWork}
% describe past work in the areas here
Estimating frequencies with high probability and low space complexity has long been a topic of interest in algorithm design. The Count-Min sketch, developed in the early $2000$s,
is an approximate data structure that maintains counts of keys seen in an online setting using space sublinear in the number of keys; its size instead depends on parameters that can be tuned for different accuracy and confidence guarantees \cc{Cormode:2005}.
More recently, in $2012$, Matusevych et al. developed the Hokusai structure, which gives frequency counts for keys over a range of time \cc{Matusevych:2012}. The rough idea behind this approach is that for the distant past we should only care about heavy hitters, i.e. hashtags with high frequencies, in order to estimate the likelihood that a hashtag is trending again. The goal of the time-aggregated Hokusai system is to store older data at decreased precision, since older data also has decreased value in the computation. The time-aggregated Hokusai system stores aggregate data in Count-Min sketches, each with a $2^i$-day resolution for the past $2^i$ days. Each of these Count-Min sketches uses $d$ hash functions from $\{0,1\}^*$ to $\{0, 1, ..., m-1\}$.
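For concreteness, a minimal Count-Min sketch of this form can be written in a few lines of Python; the snippet below is purely illustrative (the salted-MD5 hash family and the class interface are choices made for exposition, not requirements of the scheme):
\begin{verbatim}
import hashlib

class CountMinSketch:
    """Minimal Count-Min sketch: d hash rows, each of width m."""
    def __init__(self, m=3500, d=20):
        self.m, self.d = m, d
        self.table = [[0] * m for _ in range(d)]

    def _index(self, row, key):
        # Row-salted hash mapping a string key to {0, ..., m-1}.
        h = hashlib.md5(("%d:%s" % (row, key)).encode())
        return int(h.hexdigest(), 16) % self.m

    def update(self, key, count=1):
        for row in range(self.d):
            self.table[row][self._index(row, key)] += count

    def query(self, key):
        # Point query: the minimum over rows bounds the overcount.
        return min(self.table[row][self._index(row, key)]
                   for row in range(self.d))
\end{verbatim}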
Analyzing what is trending on Twitter is a more recent research area. Twitter itself must show its users hashtags that it deems trending, both to spread those hashtags and to give users an awareness of what is going on in the world. Knowledge of what is trending, and of how that distribution evolves over time, is also useful for Twitter's business model, which relies in part on ads.
Twitter stores all of its frequency data and looks at recent spikes, conditioned on the full past frequency history, to estimate how likely a topic is to trend.
In $2012$, MIT Professor Devavrat Shah built a machine-learning algorithm that predicts which topics will trend on Twitter an hour and a half before they do, with $95\%$ accuracy, though the datasets on which the algorithm was tested were small \cc{MITNews2012}.
As a starting point for approximating past frequencies, we utilize concepts from the Hokusai paper, which generalizes the Count-Min sketch scheme to time-series data \cc{Matusevych:2012}.
\subsection{Our Approaches}
Our approach was to explore whether adding historical data would improve the accuracy of
trending-hashtag detection. To that end, we implemented two algorithms: one we call the naive algorithm, the other the history-sensitive algorithm. We wanted to test whether an algorithm that uses
the history (in condensed form) to better understand what is trending at the current time could outperform an algorithm with a very short-term memory.
The history is important for the following reason: some hashtags are constantly present. For example, in our experiments, the hashtags $\#porn$, $\#gameinsight$, and $\#teamfollowback$ nearly always had a high frequency in the current time unit. This fact is not surprising given the nature of the first hashtag. The second hashtag, $\#gameinsight$, is the name of a popular video game maker that people tweet when they win achievements in its games. Finally, $\#teamfollowback$ is a method Twitter users apply to gather followers quickly: users who adopt the tag follow other users who post it. Since these sorts of tags are consistently present, they should not be considered trending: they are not new. Therefore, we need to use the history to filter them out.
How necessary it is to maintain approximate counts over a longer history depends on exactly how constant these consistently appearing hashtags actually are.
We developed two space-efficient algorithms for finding which hashtags are trending in real time.
The naive algorithm does not take the longer history into account; it simply compares the current $y$-unit interval against the $y$-unit immediately before it to find the current trending hashtags. To determine whether a hashtag is trending, we test
\begin{equation}
\label{eq:threshold}
\frac{\textbf{freq}(\textnormal{hashtag}, \textnormal{present } y \textnormal{-unit})}{\textbf{freq}(\textnormal{hashtag}, \textnormal{past } y \textnormal{-unit})} > c
\end{equation}
where $c$ is a threshold we picked to determine whether the hashtag is considered trending. We believe that accurately determining trending hashtags requires using more of the past, so that we can eliminate the hashtags which are ever-present and therefore not trending.
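For concreteness, this test can be written directly over exact per-interval counts; the snippet below is an illustrative sketch, and the floor of $1$ on unseen hashtags is our own guard against division by zero:
\begin{verbatim}
# Illustrative version of the naive trending test above.  `present` and
# `past` map hashtag -> exact count in the current and previous y-unit;
# `c` is the trending threshold.
def naive_trending(present, past, c):
    trending = []
    for tag, freq_now in present.items():
        freq_before = max(past.get(tag, 0), 1)  # floor of 1 (our guard)
        if freq_now / float(freq_before) > c:
            trending.append(tag)
    return trending
\end{verbatim}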
The second approach differs in that we do not store all the data of the past, but instead use several Count-Min sketches to approximate the past frequencies. We call this approach the history-sensitive algorithm.
The idea is to remember the past with less accuracy in order to save space, while weighting a hashtag's past frequency counts appropriately. We use the rule of thumb that the more often a hashtag has appeared in the past, the less likely it is to be trending now. However, we also discount the importance of the distant past by weighting older time periods less and less as time moves forward.
%------------------------------------------------
\section{Describing the History-Sensitive Algorithm and Data Structure}
% how did we approach it
We separate the problem of finding trending topics on Twitter into two parts. First, we need to maintain a data structure that efficiently stores data about all occurrences of every hashtag seen in the past. We also maintain a separate data structure that allows us to quickly gather information about the most recent hashtags seen.
We want the former data structure to be very space efficient since it must store data about a very large dataset. For this structure, space efficiency is more important than accuracy: small deviations in such a large dataset are not significant, because errors in the past data do not greatly affect what is currently trending.
For the latter data structure, accuracy is more important than space efficiency, since this structure contains the data most closely related to which topics are currently trending and the dataset it holds is much smaller.
\subsection{Data Structures}
% -- we maintain datastructure in two parts: History, and Present
\subsubsection{History Data Structure}
To store data about all occurrences of every hashtag seen in the past, we use a modified version of the time-aggregated Hokusai system \cc{Matusevych:2012}, which is an extension of the Count-Min sketch. We already described this data structure in \nref{sec:PreviousWork}.
To the Hokusai structure we add another Count-Min sketch that combines the information from the Hokusai Count-Min sketches. We call this external Count-Min sketch the Kernel, since it acts as a weighting function on the CM sketches in the Hokusai structure. Its role is to depreciate the value of older data.
We denote the CM sketches of the Hokusai structure as a vector $\textbf{M} = \{\overline{M}, M^0, M^1, ..., M^{{\lfloor {\log(T)} \rfloor}}\}$, where $\overline{M}$ is the Count-Min sketch storing the data of a unit time step (of size $z$), $M^j$ is the Count-Min sketch storing the data for the sketch with a $2^j$ time resolution, and $T$ is an upper bound on the number of $z$-units stored by the data structure. Then, the kernel function is given by
\begin{equation}
\label{eq:kernel}
\textit{k}(\textbf{M}) = \overline{M} + \sum\limits_{j=0}^{{\lfloor \log(T) \rfloor}} \frac{M^j}{2^j}
\end{equation}
For each hashtag, the kernel sketch stores a positive value that approximates how often the hashtag showed up in the past. In \nref{sec:Correctness} we will show that this aggregate Count-Min sketch weights the hashtags that occurred $i > 0$ $z$-units ago with weight approximately $\frac{1}{i}$.
We will refer to the combined data structure of Hokusai and the Kernel as the History.
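As an illustration, assuming all sketches share the same hash functions and expose their counter arrays as \texttt{table} (as in the snippet shown earlier), the Kernel can be formed cell by cell:
\begin{verbatim}
# Illustrative cell-by-cell construction of the Kernel sketch.
def build_kernel(M_bar, M_list):
    K = [row[:] for row in M_bar.table]      # start from a copy of M-bar
    for j, M_j in enumerate(M_list):         # M^0, M^1, ..., M^(log T)
        weight = 2.0 ** (-j)
        for r in range(len(K)):
            for c in range(len(K[r])):
                K[r][c] += weight * M_j.table[r][c]
    return K
\end{verbatim}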
\subsubsection{Current-Window Data Structure}
Our data structure for holding hashtags seen in the last $y$-unit consists of three components: a max Fibonacci heap, a queue, and a hash table. We refer to these components collectively as the Current-Window.
The keys for the hash table are the hashtags, and the values stored in the table are (frequency, pointer to corresponding node in heap) pairs. The hash table exists so that we can keep track of exact frequency counts for each $y$-unit.
The queue contains (hashtag, timestamp) pairs, each of which is inserted upon seeing a hashtag in the input stream. The purpose of the queue is to determine when a hashtag count is too old to be maintained in the present $y$-unit window.
Finally, we have a Fibonacci heap to prioritize the hashtags based on how trending they currently are.
We provide a heap priority function, which defines what trending means for the whole data structure.
For a specific hashtag, we define the priority function to be
\begin{equation}
\label{eq:priority}
\textbf{priority}(\textnormal{hashtag}) = \frac{\textbf{freq}(\textnormal{hashtag}, \textnormal{present } y \textnormal{-unit})}{\textbf{History}[\textnormal{hashtag}]}
\end{equation}
The idea, again, is that the priority should be inversely proportional to the historical frequency; here the history is approximated by the Kernel sketch, which lets us save space.
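For illustration, the priority computation is a single lookup against each structure; in the sketch below, the floor of $1$ on the denominator is an assumption we add to avoid dividing by a zero estimate:
\begin{verbatim}
# Illustrative computation of the priority above.  `window_counts` is
# the Current-Window hash table of exact counts and `history` is the
# Kernel Count-Min sketch (names are ours).
def priority(tag, window_counts, history):
    hist = max(history.query(tag), 1)   # guard against a zero estimate
    return window_counts[tag] / float(hist)
\end{verbatim}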
\subsection{Algorithm Pseudocode}
\subsubsection{Updating History Data Structure}
Algorithm \aref{alg1} describes the necessary steps to maintain the History data structure as new input is provided. We perform time aggregation to fill the Kernel, $K$. This algorithm is only called when a $z$-unit has passed since the last time the algorithm was called.
\begin{algorithm}
\caption{Update History} \label{alg1}
\begin{algorithmic}[1]
\ForAll {$i$}
\State Initialize Count-Min sketch $M^i = 0$
\EndFor
\State Initialize $t = 0$
\State Initialize Count-Min sketches $\overline{M} = 0$ and $K = 0$
\While {data arrives}
\State Aggregate data into sketch $\overline{M}$ for current $z$-unit
\State $t \leftarrow t + 1$ (increment counter)
\For {$j = 0$ to argmax \{$l$ where $t$ mod $2^l = 0$\}}
\State $K \leftarrow K + 2^{-j}(\overline{M} - M^j)$ (update the Kernel)
\State $M^{\mathrm{tmp}} \leftarrow \overline{M}$ (back up into temporary storage)
\State $\overline{M} \leftarrow \overline{M} + M^j$ (increment cumulative sum)
\State $M^j \leftarrow M^{\mathrm{tmp}}$ (new value for $M^j$)
\EndFor
\State $\overline{M} \leftarrow 0$ (reset aggregator)
\EndWhile
\end{algorithmic}
\end{algorithm}
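The same aggregation step can be sketched in Python as follows; the element-wise operations \texttt{scale\_add}, \texttt{add}, \texttt{copy}, and \texttt{zero} are assumed helpers on the sketch counter tables, not calls from any library:
\begin{verbatim}
# Illustrative end-of-z-unit aggregation mirroring Algorithm 1.
# M_bar is the unit sketch, M is the list [M^0, M^1, ...], K is the
# Kernel sketch, and t is the z-unit counter.
def end_of_z_unit(M_bar, M, K, t):
    j = 0
    while j < len(M) and t % (2 ** j) == 0:
        K.scale_add(M_bar, 2.0 ** (-j))      # K += 2^-j * M_bar
        K.scale_add(M[j], -(2.0 ** (-j)))    # K -= 2^-j * M^j
        tmp = M_bar.copy()                   # back up the aggregator
        M_bar.add(M[j])                      # form the cumulative sum
        M[j] = tmp                           # shift it into M^j
        j += 1
    M_bar.zero()                             # reset for the next z-unit
\end{verbatim}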
\subsubsection{Updating heap and hash tables}
We run Algorithm \aref{alg2} in the main loop, every time a single piece of data arrives.
For each new piece of data, we also check whether the data we are storing has grown
outdated -- that is, older than a $y$-unit before the most current piece of data arrived.
\begin{algorithm}
\caption{Update Current-Window} \label{alg2}
\begin{algorithmic}[1]
\While {data arrives}
\If {the current $z$-unit is different than that of the last hashtag seen}
\State Do all the end-of-$z$ aggregation for the History structure as detailed in
Algorithm \aref{alg1}.
\ForAll {elements in the Fibonacci heap}
\State look up the hashtag corresponding to this node in the hash table
\State Update the key of the node to: $\textbf{priority}(\textnormal{hashtag})$
\EndFor
\EndIf
\If {queue is not empty}
\State Peek at the oldest entry in the queue.
\If {its timestamp + $y$ is before the current time}
\State Look up the hashtag in the hash table and decrement the stored frequency.
\If {the frequency is now 0}
\State Delete the node in the heap pointed to by this entry in the table.
\State Delete this entry in the hash table.
\Else
\State Update the key of the node with the new frequency.
\EndIf
\EndIf
\EndIf
\If {hashtag is in hash table}
\State Increment the frequency stored at that entry.
\State Update the key of the node in the Fibonacci heap.
\Else
\State Insert a new node into the Fibonacci heap with the appropriate key and value.
\State Insert the hashtag into the hash table with a pointer to this node.
\EndIf
\EndWhile
\end{algorithmic}
\end{algorithm}
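For illustration, the queue-expiry step of Algorithm \aref{alg2} might look as follows; the window layout, the heap interface, and the \texttt{priority\_fn} hook are assumptions made to keep the sketch short:
\begin{verbatim}
# Illustrative sketch of the queue-expiry step in Algorithm 2.  It
# checks (at most) one queue entry per arriving hashtag, as in the
# pseudocode.  `queue` is a collections.deque of (hashtag, timestamp)
# pairs in arrival order; `window` maps hashtag -> [count, heap_node];
# the heap is assumed to expose delete(node) and update_key(node, key);
# `priority_fn` recomputes a node's key.  All of these names are ours.
def expire_one(queue, window, heap, now, y, priority_fn):
    if not queue:
        return
    tag, timestamp = queue[0]
    if timestamp + y < now:            # entry has aged out of the window
        queue.popleft()
        entry = window[tag]
        entry[0] -= 1                  # decrement stored frequency
        if entry[0] == 0:
            heap.delete(entry[1])      # drop the heap node
            del window[tag]            # and the hash-table entry
        else:
            heap.update_key(entry[1], priority_fn(tag))
\end{verbatim}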
\subsubsection{Finding the trending hashtags}
We use Algorithm \aref{alg3} to calculate the top $k$ hashtags by querying the heap. The heap is the structure responsible for maintaining the priority of the various hashtags, and provides an efficient way of implementing a changing priority queue.
\begin{algorithm}
\caption{Top $k$ trending hashtags} \label{alg3}
\begin{algorithmic}[1]
\State Perform $k$ delete-max operations on the Fibonacci heap, storing each of the nodes deleted in a list $L$.
\ForAll {nodes $N$ in $L$}
\State Announce that the hashtag associated with $N$ is trending.
\State Insert $N$ into the Fibonacci heap.
\EndFor
\end{algorithmic}
\end{algorithm}
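For illustration, the query itself is a handful of heap operations. The sketch below substitutes Python's \texttt{heapq} (a binary min-heap over negated priorities) for the Fibonacci max-heap, purely to keep the example self-contained:
\begin{verbatim}
import heapq

# Illustrative top-k query; heap entries are (-priority, hashtag).
def top_k(heap, k):
    popped = [heapq.heappop(heap) for _ in range(min(k, len(heap)))]
    for neg_priority, tag in popped:
        print("trending:", tag)                    # announce the hashtag
        heapq.heappush(heap, (neg_priority, tag))  # reinsert the node
    return [tag for _, tag in popped]
\end{verbatim}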
%------------------------------------------------
\section{Analyzing the History-Sensitive Algorithm}
\begin{table}[h]
\centering
\begin{tabular}{@{}ll@{}}
\toprule
Symbol & Meaning \\ \midrule
$d$ & Number of hash tables for each Count-Min sketch \\
$m$ & Size of each hash table in each Count-Min sketch \\
$s$ & Number of distinct hashtags in the Current-Window \\
$z$ & Time resolution of the unit-sized Count-Min sketches \\
$T$ & Upper bound on the number of $z$-units of data stored in History \\
$x$ & Total number of hashtags in the Current-Window \\
$y$ & Time interval of data contained in the Current-Window \\ \bottomrule
\end{tabular}
\caption{Variables referenced in this section}
\end{table}
\newpage
\subsection{Correctness} \label{sec:Correctness}
Our history-sensitive algorithm finds the $k$ hashtags that have the maximum value of
$
\frac{\textbf{freq}(\textnormal{hashtag}, \textnormal{present } y \textnormal{-unit})}{\textbf{History}[\textnormal{hashtag}]}
$.
\begin{claim}
The value for hashtag $x$ in the aggregate Count-Min sketch of the History data structure is within a factor $4$ of $\overline{M}(x) + \sum\limits_{i=1}^T \frac{1}{i}*$ (value for $x$ in a Count-Min sketch using the same hash functions for all hashtags occurring $i$ $z$-units ago).
\end{claim}
\begin{proof}
First, we use Theorem $4$ in \cc{Matusevych:2012} which states that ``At $t$, the sketch $M^j$ contains statistics for the period $[t - \delta, t - \delta - 2^j]$ where $\delta = t \mod 2^j$.''
Let $b$ be the location in the aggregate Count-Min sketch containing the value returned when $x$ is queried.
Let $h$ be any instance of any hashtag that appeared on $z$-unit $p$ such that seeing $h$ incremented counters in position $b$.
\bigskip
\noindent $\textbf{Case 1}$: $p=t$
Then, the statistics for $h$ are recorded in $\overline{M}$ and are not in any $M^j$.
\noindent $\textbf{Case 2}$: $2^i > t-p \geq 2^{i-1}$ for some $i > 0$
By Theorem 4, for all $j \leq i - 2$, $M^j$ does not contain statistics for $p$ since $p \leq t - 2^{i-1} \leq t - \delta - 2^{i-2}$.
\bigskip
Therefore, the increments that occurred in the Count-Min sketch for hashtags occurring $i$ $z$-units ago contribute at most $\sum\limits_{j = i - 1}^{T} 2^{-j} < 2^{2-i}$ to the value in position $b$.
Let $q$ be the largest $j$ such that $t - \delta - 2^j \geq p$.
Then $p \leq t - \delta - 2^q$. Let $\lambda = t \mod 2^{q+1}$. Then $\lambda = \delta$ or $\lambda = \delta + 2^q$.
Since $q$ is the largest $j$ such that $t - \delta - 2^j \geq p$, $p > t - \lambda - 2^{q+1}$.
Also, $p \leq t - \delta - 2^q \leq t - \lambda$, so $M^{q+1}$ contains statistics about $h$.
For all $j \geq i$, $t - \delta - 2^j < t - 2^j < p$, so $q \leq i - 1$.
Thus, incrementing the counter in $M^{q + 1}$ contributed at least $2^{-i}$ to the value in position $b$.
Thus, the contributions to the sum are within a factor $4$ of $\frac{1}{t-p}$.
Therefore, summing over all hashtags that increment counters in position $b$ gives $\overline{M}(x)$ for all hashtags that occurred on $z$-unit $t$, and within a factor $4$ of $\sum\limits_{i=1}^T \frac{1}{i}*$(value for $x$ in a Count-Min sketch using the same hash functions for all hashtags occurring $i > 0$ $z$-units ago).
\end{proof}
Given this proof and the fact that Count-Min sketches return approximate frequency counts, we know the priority function to be approximately:
\begin{equation}
\label{eq:approxpriority}
\textbf{priority}(\textnormal{hashtag}) \approx \frac{\textbf{freq}(\textnormal{hashtag}, \textnormal{present } y \textnormal{-unit})}{\textbf{freq}(\textnormal{hashtag}, \textnormal{present } z \textnormal{-unit}) + \sum\limits_{i=1}^T \frac{1}{i}* \textbf{freq}(\textnormal{hashtag}, i \ z \textnormal{-units ago})}
\end{equation}
%This value is approximately (frequency in last $y$-unit) / (freq current $z$-unit + $\sum\limits_{i=1}^T \frac{1}{i} *$ (frequency of hashtag $i$ $z$-units ago)).
This seems to be a desirable function to maximize, since it finds hashtags that are common in the last $y$-unit but have been comparatively infrequent in the past. The function also especially emphasizes hashtags that differ from those seen in the past few $z$-units, which ensures that the same items do not stay trending for too long.
\subsection{Runtime Analysis}
We analyze the time required to process the insertion of a hashtag. It takes amortized $O(d)$ time to update the History. It takes expected $O(1)$ time to check whether the hashtag is in the hash table.
If it is, it requires $O(1)$ time to increment the frequency, $O(d)$ time to compute the new key, and $O(1)$ amortized time to update the key since it is non-decreasing.
Otherwise, it requires $O(d)$ time to compute the key value, $O(1)$ time to insert the new node into the heap, and $O(1)$ time to insert into the hash table.
Thus, our algorithm takes $O(d)$ amortized time plus expected $O(1)$ time to process a new hashtag.
\begin{table}[h]
\centering
\begin{tabular}{@{}ll@{}}
\toprule
Case & Amortized Time \\ \midrule
Processing the insertion of a new hashtag & $O(d)$ \\
\begin{tabular}[c]{@{}l@{}}Processing the removal of a hashtag from \\ the Current-Window\end{tabular} & $O(d + \log(s))$ \\
\begin{tabular}[c]{@{}l@{}}Updating History and Current-Window \\ at the end of a $z$-unit\end{tabular} & $O(md + ds + s\log(s))$ \\
Querying for the top $k$ trending items & $O(k\log(s))$ \\ \bottomrule
\end{tabular}
\caption{Time analysis summary}
\end{table}
Next we analyze the required time to process the removal of a hashtag from the Current-Window. It takes $O(1)$ time to verify that the queue is not empty. It takes $O(1)$ time to peek at the end of the queue and verify that the timestamp $ + y$ is before the current time. It takes $O(1)$ time in expectation to look up this hashtag and decrement its frequency. Then, it takes $O(1)$ time to check if the frequency is $0$.
If so, it takes $O(\log(s))$ amortized time to delete the node in the heap and $O(1)$ time to delete the entry in the hash table.
Otherwise, it takes $O(d)$ amortized time to compute the new key for the hash table and $O(\log(s))$ amortized time to update the heap given this key.
Thus, our algorithm requires $O(\log(s))$ amortized time $+$ expected $O(1)$ time $+ O(d)$ time to remove a hashtag from the Current-Window.
Now we analyze the time due to the end-of-day updates to the History and the resulting updates to the heap. By Lemma $5$ in \cc{Matusevych:2012}, the amortized time required to do all end-of-day aggregation is $O(md)$. Then, for each of the $s$ nodes in the heap, it takes $O(d)$ time to compute each updated key and $O(\log(s))$ amortized time to update the heap given the new key.
Thus, it takes $O(md + s\log(s))$ amortized time $+ O(ds)$ time to do all necessary updates at the end of the day.
Querying the data structure for the top $k$ trending items takes $O(k\log(s))$ amortized time for $\textbf{delete-max}$ operations, $O(k)$ time to announce that these items are trending, and $O(k)$ amortized time to reinsert these nodes into the heap.
Thus, it takes $O(k\log(s))$ amortized time to determine what is trending.
\subsection{Spatial Analysis}
The History requires $O(md)$ space for each Count-Min sketch, so it requires a total of $O(md\log(T))$ space.
Each node in the heap requires a constant amount of space, so the heap requires $O(s)$ space.
The hash table always contains at most $s$ entries with each entry requiring a constant amount of space. Also, in order to maintain an expected $O(1)$ lookup time, the hash table needs to have $O(s)$ bins. Thus, the hash table requires $O(s)$ space.
The queue requires an entry for every hashtag seen in the last $y$-unit, so it requires $O(x)$ space.
Thus, everything requires $O(md\log(T) + x)$ space since $s < x$.
%------------------------------------------------
\section{Design Choices}
% y is 3 hours, z is 1 day
\subsection{Choosing the parameters $y$ and $z$}
For $y$, the unit time period we attempted to find the trending hashtags for, we chose a $3$-hour interval.
For $z$, the unit time period for the History data structure, we chose day-length intervals. These values made sense because the testing data we used occurred over a $30$-day interval, and we wanted to avoid too fine a temporal resolution so that we could run our code on the data in a reasonable amount of time.
\subsection{Parameters of the History Data Structure}
Since $T = 30$ (our data covers September $2014$, so $2^{5} = 32$ days of resolution is certainly enough), we store $6$ Count-Min
sketches in the Hokusai portion of the History data structure, with an extra Count-Min
sketch for the Kernel.
Each of these CM sketches has $m = 3500$ and $d = 20$. From the analysis of the Count-Min sketch, we have that
\begin{equation}
\label{eq:error_control}
m = \lceil \frac{e}{\epsilon} \rceil
\end{equation}
and
\begin{equation}
\label{eq:certainty_control}
d = \lceil \textnormal{ln}(\frac{1}{\delta}) \rceil
\end{equation}
where, with probability at least $1 - \delta$, the error in a query is at most $\epsilon$ times the total count. By Theorem $1$ in \cc{Matusevych:2012}, if $n_x$ is the true count of a hashtag stored in the CM sketch and $c_x$ is the estimate, we are guaranteed $n_x \leq c_x \leq n_x + \epsilon \sum_{x'} n_{x'}$.
By choosing $m = 3500$ and $d = 20$, we have $\epsilon = \frac{e}{3500}$ and $1 - \delta = 1 - \frac{1}{e^{20}}$, so the bound above holds with very high probability. The error term is a multiplier on the total number of hashtags, of which there are $\sim 15$ million as an upper bound. Thus, with very high probability, a given estimate is within $\sim 12000$ of the true count.
Using less memory increases the error. These error terms are added to the denominator of our priority function, and as a result they prevent hashtags with comparatively low frequency from being identified as trending, which is a desirable property.
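For illustration, the sketch dimensions follow directly from these formulas; the helper below is an illustrative calculation, with the example values subject to floating-point rounding:
\begin{verbatim}
import math

# Illustrative parameter selection from the two formulas above.
def cm_parameters(epsilon, delta):
    m = int(math.ceil(math.e / epsilon))
    d = int(math.ceil(math.log(1.0 / delta)))
    return m, d

# e.g. epsilon ~ e/3500 and delta ~ e**-20 recover roughly m = 3500
# and d = 20 (up to floating-point rounding).
\end{verbatim}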
\subsection{Choosing the Flavor of Heap}
We chose a Fibonacci heap in the hope that there would be a large number of key-update ($\textbf{decrease-key}$-style) operations, which have a better theoretical amortized bound in Fibonacci heaps.
%------------------------------------------------
\section{Testing Procedure}
We downloaded the \href{https://archive.org/details/twitterstream}{September $2014$ archive of Twitter data} to serve as test data for our algorithms \cc{Twitter2014}. We then parsed the data into (timestamp, hashtag) pairs
for all thirty days' worth of data. Using these files as pseudo-real-time input, we used the last week of the data as a benchmark for comparing the naive and history-sensitive algorithms. We divided the data into three-hour chunks and queried for the top ten hashtags in each three-hour block
in the last week of the data using both algorithms. Note that the history-sensitive algorithm had to parse the first three weeks as well in order to build its history, whereas the naive algorithm only needed to pay attention to six-hour chunks at a time (the current and previous $y$-units), and thus never needed to look beyond the scope of that last week.
We used the Jaccard similarity to arrive at a similarity score between the two methods. We also investigated the hashtags that both algorithms produced for a given $3$-hour interval and verified,
using news sources and Twitter media, that they were in fact trending at that time.
%------------------------------------------------
\section{Results}
\subsection{Runtime Measurements}
We use the space complexities of the data structures and the sizes of the
respective inputs to determine approximately how much space each approach
uses, and compare them. The naive algorithm's runtime on one week's worth
of hashtags was $\sim 15$ minutes. Note that the naive algorithm did not have
to take the previous $3$ weeks into account, since it only ever depends on
the frequencies from $3$ hours before the present time. The history-sensitive
algorithm, on the other hand, first had to build a history from the first three
weeks of the month before running on the last week for comparison with the naive
algorithm. The total running time for the history-sensitive algorithm was
$\sim 2$ hours.
\subsection{Are the Algorithm Outputs Actually Trending?}
\subsubsection{The Intersection Distribution}
First, we see how many hashtags were marked as trending
by both the naive and history-sensitive algorithms in the same time slots.
For each $3$-hour interval, we calculate the Jaccard Similarity, a value
in the range $[0, 1]$, given by
\begin{equation}
\label{eq:jaccard}
J(A, B) = \frac{|A \cap B|}{|A \cup B|}
\end{equation}
\noindent where $A$ and $B$ are the sets of top $10$ hashtags for a given $3$-hour interval
for each algorithm.
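For reference, this similarity is straightforward to compute from the two top-$10$ lists; in the snippet below, the convention of returning $1$ for two empty sets is our own choice:
\begin{verbatim}
# Illustrative Jaccard similarity between the two algorithms' top-10
# sets for one 3-hour interval (names are ours).
def jaccard(a, b):
    a, b = set(a), set(b)
    if not a and not b:
        return 1.0                      # convention for two empty sets
    return len(a & b) / float(len(a | b))
\end{verbatim}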
We then calculate the mean and standard deviation of the Jaccard similarity
over the $56$ different $3$-hour intervals of the last week of September $2014$.
This value gives us a metric for understanding how similarly the two algorithms performed.
We plot this data in the histogram shown in \aref{fig:jaccard_sims}. The $x$-axis is the Jaccard similarity, and the $y$-axis is the count. The mean was $0.47$, and the standard deviation was $0.18$. From the plot, we can see that the distribution of the Jaccard similarities is roughly normal and centered around $0.5$. A Jaccard similarity of $0.5$ means that about half the hashtags in each $3$-hour interval are the same between the two algorithms.
\begin{figure}
\centering
\includegraphics[scale = .4]{jaccard_sims}
\caption{Distribution of Jaccard Similarities}
\label{fig:jaccard_sims}
\end{figure}
\subsubsection{Checking the Algorithm-Defined Trends Against Real Life}
In \aref{fig:news_vis}, we have superimposed on a time axis pictures from
the news articles that the tweets describe, placed at the times when the
hashtags referencing those articles start trending. The Jaccard similarities
for each of these three-hour intervals were all above $0.5$, so both algorithms
found the hashtags that we graph in the figure.
We checked that the news was released (via Yahoo! News, Twitter, Google News, ESPN, and others)
at around the time our algorithms say the hashtag relating to the news
begins to trend. A fair number of the trending hashtags are related to sports and TV shows.
As we can see in the figure, our algorithm detected the start of the TV show ``How to Get Away With Murder'', which premiered on September $25$, $2014$. Another TV-related hashtag was $\#dailyshowgonetoofar$, which appeared when Jon Stewart called One Direction member Zayn Malik a terrorist (on the $26^{th}$) and caused a huge outrage. Sports events broadcast live on TV also show this trending profile; there were three in this stretch of six three-hour intervals. First, Derek Jeter got a walk-off hit in his final home at-bat for the Yankees, and naturally fans on Twitter were excited. This event occurred on the $25^{th}$. The Ryder Cup is a major golf tournament that began on the $26^{th}$ and lasted through the $28^{th}$, and its start launched a wave of tweets. The last sports-related hashtag was the start of the ToyotaEMR car race that took place in India on September $26^{th}$. Finally, $\#blowthewhistle$ related to a children's cancer campaign that ran throughout September, which perhaps got a boost to raise last-minute funds as the month ended.
\begin{figure}
\centering
\includegraphics[scale = .4]{Timeline}
\caption{News Visualization of Trending Tweets}
\label{fig:news_vis}
\end{figure}
%------------------------------------------------
\section{Discussion}
\subsection{Comparison of Naive and History-Sensitive Algorithms}
Based on the approximately normal distribution of the Jaccard similarities centered near $0.5$, we observe that the history-sensitive approach performed about as well as the naive approach at identifying trending hashtags. However, the naive algorithm uses considerably less space and time (since it only needs to examine the
$y$-unit before the current $y$-unit) to get similar results. Thus we conclude that, for this particular task, the space efficiency of the naive approach outweighs the benefits of the history-sensitive approach. Furthermore, in developing the history-sensitive algorithm, we relied on larger sketch errors as a form of smoothing in order to perform decently, suggesting that our approach to evaluating the history may not be the best one for trend detection.
However, we note from inspection that the history-sensitive algorithm has more staying power than
the naive approach. While in the naive approach it is rare for adjacent $y$-units to have the same
trending hashtags, this occurs more often in the history-sensitive approach when it makes sense: for instance, if an event went on for a whole day, the naive algorithm would quickly stop showing the hashtags related to that event, while the history-sensitive algorithm would not discard them so easily.
Because Twitter limits access to its data, the datasets on which we tested our algorithms only contained 1\% of the total number of tweets. If we were to run our algorithms on the entire dataset, we would expect that the space efficiency of the history-sensitive algorithm would improve relative to the naive algorithm because the naive algorithm would need to store 100 times more data while the history-sensitive algorithm could just reduce the size of the $y$-units to keep it nearly as space efficient. We would not expect this to greatly affect the history-sensitive algorithm's accuracy because we could still process the same number of recent tweets to compare against the History structure. Thus, on the full dataset, we could still reasonably expect the history-sensitive algorithm to be at least as space efficient as the naive algorithm.
\subsection{Applicability to Real-Time Data}
In this paper, we only simulated our algorithm running on live data, since we did not
have good access to the live Twitter firehose. In terms of performance, although it was overshadowed
by the simpler naive method because of their similar behavior (they capture a fair number of the same hashtags),
the history-sensitive algorithm did reasonably well. It took a few hours to run our algorithm
on a $15$-million-hashtag dataset, with relatively large data structures to update, implemented in Python, a comparatively slow language.
Were we to apply our algorithm to real-world, real-time data, we would re-implement our code in C or another space- and time-efficient language.
Furthermore, for real data from all of the Twitter firehose, we would require a better hardware infrastructure dedicated to processing all the data. Ideally we would use distributed systems and would
parallelize our code before running it. The fact that the code is broken up into disparate parts makes it easier to implement in parallel.
Therefore, given some adjustments, the algorithms developed in this paper open the doors to new explorations of topic modeling the Twitter data stream.
%------------------------------------------------
\section{Future Work} \label{sec:Future Work}
With respect to the algorithms specifically, in the future we would like to adapt our approach to the history-sensitive algorithm and experiment with the parameters further. We would also like to try different kernel functions to see how they perform. Another approach to utilizing the history information would be, instead of weighting by an inverse-exponentially-weighted past frequency for a given hashtag, to compute a score of periodicity for the hashtag. If a hashtag has low periodicity over a given time scale, then even if it appears multiple times in the past, our algorithm would avoid discarding it as non-trending despite its potentially high total frequency count (since the history is long).
Then, we would like to try adapting our algorithms to real-time data: that is, hooking our algorithms up to a significant portion of the Twitter firehose, after setting up the necessary hardware to process that amount of data. To aid with the processing, we would also ideally use a distributed system and parallelize
our code for better performance.
We also know that an estimation of the top $k$ hashtags in a given time interval could provide a topic model for tweets. It could be interesting to use the top $k$ hashtags to build a time-sensitive topic model and analyze quantities including the speed of topic change, the variety of topics, and so on.
Even though it is perhaps not suited for the specific task of identifying trending hashtags,
the approach of the history-sensitive algorithm may be useful in other domains.
Future work could also investigate the performance of the history-sensitive algorithm on other data streams of interest, for instance Wikipedia edits, where the goal would be to estimate which Wikipedia topics are being edited the most in any given time interval of some length. Another interesting application would be real-time estimation of the hottest-selling stocks on Wall Street. Either of these settings might show a more significant improvement over the naive algorithm.
\section{Appendix I: Code and Visualizations} \label{sec:Appendix_code_viz}
We provide a link to \href{https://github.com/kiranvodrahalli/cos521/}{all our code}. In the repository, there are also the results files, Powerpoint slides, and citations for the pictures we used.
We also provide a link to an online hosting of the \href{http://www.princeton.edu/~awlee/trends.html}{frequency visualization}. The frequency visualization is a histogram of hashtag frequencies displayed over time. We cherry-picked a set of interesting hashtags based on our observations and made this visualization to present how the frequencies of these hashtags change across $3$-hour intervals.
The fact that these frequencies spike and fall in turn suggests that
when a hashtag's frequency spikes, the hashtag is trending. The goal of our algorithms is to identify the hashtags (out of a much larger set) which are trending at a given point in time.
So we hope that, for a selected $3$-hour interval, the hashtags that our algorithms report as trending include the ones that are trending at that time in this visualization.
In our code, we made use of two libraries written by other people, which we modified. One of these was a Count-Min sketch implementation in Python, and another was a Fibonacci heap implementation, also in Python. These program files are in our codebase, with appropriate citations at the top of each document. Furthermore, we used multiple standard Python libraries in our code.
\section{Appendix II: Top-$k$ Hashtags Results} \label{sec:Appendix_topk}
To see all $56$ $3$-hour intervals for both algorithms, see the results folder of our \href{https://github.com/kiranvodrahalli/cos521/}{Github}. We display the top $10$ hashtags for each $3$-hour interval for both the naive algorithm and the history-sensitive algorithm, along with their heap priorities. Note that the heap priorities span a reasonable range from $0$ to $1$, and are not clustered around $1$ or $0$, implying the algorithms are actually differentiating the hashtags as trending and not trending.
}
%----------------------------------------------------------------------------------------
% REFERENCE LIST
%----------------------------------------------------------------------------------------
\begin{thebibliography}{99} % Bibliography - this is intentionally simple in this template
\bibitem[Burks2014]{Burks:2014}
Burks, L., Miller, M., and Zadeh, R. (2014).
\newblock Rapid estimate of ground shaking intensity by combining simple earthquake characteristics with tweets.
\newblock {\em Tenth U.S. National Conference on Earthquake Engineering Frontiers of Earthquake Engineering}. Retrieved from \href{http://www.stanford.edu/~rezab/papers/eqtweets.pdf}{http://www.stanford.edu/~rezab/papers/eqtweets.pdf}
\bibitem[Cormode2005]{Cormode:2005}
Cormode, G., and Muthukrishnan, S. (2005).
\newblock An improved data stream summary: The count-min sketch and its applications.
\newblock {\em Journal of Algorithms, 55}(1), pp. 58-75.
\bibitem[MITNews2012]{MITNews2012}
Hardesty, Larry.
\newblock {\em "Predicting What Topics Will Trend on Twitter."}
\newblock MIT News Office. MIT, 1 Nov. 2012. Web. 13 Jan. 2015. \href{http://newsoffice.mit.edu/2012/predicting-twitter-trending-topics-1101}{http://newsoffice.mit.edu/2012/predicting-twitter-trending-topics-1101}.
\bibitem[Matusevych2012]{Matusevych:2012}
Matusevych, S., Smola, A., and Ahmed, A. (2012).
\newblock Hokusai--Sketching Streams in Real Time.
\newblock {\em UAI '12: 28th conference on Uncertainty in Artificial Intelligence}, pp. 594-603.
\bibitem[Twitter2014]{Twitter2014}
Twitter, Inc. (2014).
\newblock {\em Archive Team JSON Download of Twitter Stream 2014-09}.
\newblock Retrieved from \href{https://archive.org/details/twitterstream}{https://archive.org/details/twitterstream}.
\end{thebibliography}
%----------------------------------------------------------------------------------------
\end{document}