forked from lrargerich/Apunte
-
Notifications
You must be signed in to change notification settings - Fork 0
/
apunte.toc
751 lines (751 loc) · 54.9 KB
/
apunte.toc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
\contentsline {part}{I\hspace {1em}Data Science}{1}
\contentsline {chapter}{\numberline {1}Fundamentos}{3}
\contentsline {section}{\numberline {1.1}Introducci\IeC {\'o}n}{3}
\contentsline {subsection}{\numberline {1.1.1}Hacer una pregunta interesante}{4}
\contentsline {subsubsection}{Regresi\IeC {\'o}n}{4}
\contentsline {subsubsection}{Clasificaci\IeC {\'o}n}{5}
\contentsline {subsubsection}{Clustering}{6}
\contentsline {subsubsection}{Recomendaciones}{8}
\contentsline {subsubsection}{Sistemas de Consulta}{9}
\contentsline {subsubsection}{Identificaci\IeC {\'o}n de Patrones}{9}
\contentsline {subsection}{\numberline {1.1.2}Conseguir los datos necesarios}{10}
\contentsline {subsubsection}{Obtener los Datos}{10}
\contentsline {subsubsection}{Depurar los Datos}{11}
\contentsline {subsubsection}{Detectar y eliminar Anomal\IeC {\'\i }as}{12}
\contentsline {subsubsection}{Valores Faltantes}{13}
\contentsline {subsection}{\numberline {1.1.3}Explorar los datos}{14}
\contentsline {subsubsection}{Analizar la estructura de los datos}{14}
\contentsline {subsubsection}{Navegar los datos}{14}
\contentsline {subsubsection}{Resumen estad\IeC {\'\i }stico}{15}
\contentsline {subsubsection}{Visualizar los datos}{15}
\contentsline {subsection}{\numberline {1.1.4}Aplicar a los datos el algoritmo necesario}{18}
\contentsline {subsection}{\numberline {1.1.5}Comunicar los resultados}{18}
\contentsline {section}{\numberline {1.2}Formatos de Datos}{18}
\contentsline {subsection}{\numberline {1.2.1}Data Frames}{18}
\contentsline {subsubsection}{Codificaci\IeC {\'o}n de Atributos Categ\IeC {\'o}ricos}{20}
\contentsline {subsubsection}{Codificaci\IeC {\'o}n de atributos categ\IeC {\'o}ricos en gran escala}{22}
\contentsline {subsection}{\numberline {1.2.2}Texto}{24}
\contentsline {subsubsection}{Bag of Words (BOW)}{24}
\contentsline {subsection}{\numberline {1.2.3}Datos Matriciales}{25}
\contentsline {subsubsection}{Matrices Dispersas}{25}
\contentsline {subsubsection}{CRS (Compressed Row Storage)}{26}
\contentsline {subsection}{\numberline {1.2.4}Im\IeC {\'a}genes}{26}
\contentsline {section}{\numberline {1.3}Niveles de Almacenamiento}{26}
\contentsline {subsection}{\numberline {1.3.1}Datos en Memoria}{27}
\contentsline {subsection}{\numberline {1.3.2}Datos en Disco}{27}
\contentsline {subsection}{\numberline {1.3.3}Datos en un Cluster}{28}
\contentsline {section}{\numberline {1.4}La Ley de Zipf y las Leyes de Potencias}{28}
\contentsline {subsection}{\numberline {1.4.1}Pareto, Zipf y Power-Laws}{29}
\contentsline {subsection}{\numberline {1.4.2}Leyes de Potencias}{33}
\contentsline {subsection}{\numberline {1.4.3}Propiedades de las Leyes de Potencias}{34}
\contentsline {subsubsection}{Necesidad de un valor m\IeC {\'\i }nimo}{35}
\contentsline {subsubsection}{Diferencias con una exponencial}{35}
\contentsline {subsubsection}{Dependencia de los eventos}{35}
\contentsline {subsubsection}{Invariancia a la escala}{36}
\contentsline {subsubsection}{Promedio y varianza}{36}
\contentsline {subsubsection}{Fen\IeC {\'o}meno del Cisne Negro}{36}
\contentsline {subsection}{\numberline {1.4.4}Origen de las leyes de potencias}{36}
\contentsline {subsection}{\numberline {1.4.5}Resumen}{36}
\contentsline {section}{\numberline {1.5}Algunos Elementos de Probabilidad y Estad\IeC {\'\i }stica}{37}
\contentsline {subsection}{\numberline {1.5.1}El Principio de Bonferroni}{37}
\contentsline {subsection}{\numberline {1.5.2}La Ecuaci\IeC {\'o}n mas peligrosa de la historia}{39}
\contentsline {subsection}{\numberline {1.5.3}Frecuentistas vs Bayesianos}{41}
\contentsline {subsubsection}{Estimaci\IeC {\'o}n Bayesiana con la Distribuci\IeC {\'o}n Beta}{43}
\contentsline {subsection}{\numberline {1.5.4}La \IeC {\'U}nica Ley sin Explicaci\IeC {\'o}n}{46}
\contentsline {subsection}{\numberline {1.5.5}Skewness}{47}
\contentsline {section}{\numberline {1.6}Algoritmos Aleatorizados}{48}
\contentsline {paragraph}{Algoritmos tipo Montecarlo}{48}
\contentsline {paragraph}{Algoritmos tipo Las Vegas}{48}
\contentsline {subsection}{\numberline {1.6.1}Algoritmo de Fermat}{49}
\contentsline {subsection}{\numberline {1.6.2}Algoritmo de Miller-Rabin}{50}
\contentsline {subsection}{\numberline {1.6.3}Random Walks}{50}
\contentsline {subsection}{\numberline {1.6.4}Markov Chains}{51}
\contentsline {subsubsection}{Modelando Lenguajes como Procesos de Markov}{52}
\contentsline {subsection}{\numberline {1.6.5}Hill Climbing}{54}
\contentsline {subsection}{\numberline {1.6.6}El Algoritmo Metr\IeC {\'o}polis-Hastings}{54}
\contentsline {subsection}{\numberline {1.6.7}Simulated Annealing}{54}
\contentsline {section}{\numberline {1.7}MCMC}{56}
\contentsline {section}{\numberline {1.8}Gibbs Sampling}{59}
\contentsline {section}{\numberline {1.9}La Uni\IeC {\'o}n hace la fuerza: Ensambles}{59}
\contentsline {chapter}{\numberline {2}KNN}{61}
\contentsline {section}{\numberline {2.1}La M\IeC {\'e}trica a emplear}{62}
\contentsline {subsection}{\numberline {2.1.1}Distancia Minkowsky}{63}
\contentsline {subsection}{\numberline {2.1.2}Distancia Manhattan}{63}
\contentsline {subsection}{\numberline {2.1.3}Distancia de Hamming o Norma $l_0$}{63}
\contentsline {subsection}{\numberline {2.1.4}Norma $l_{\infty }$}{63}
\contentsline {subsection}{\numberline {2.1.5}Distancia de Mahalanobis}{64}
\contentsline {subsection}{\numberline {2.1.6}Distancia Coseno}{65}
\contentsline {subsection}{\numberline {2.1.7}Distancia de Edici\IeC {\'o}n o Levenshtein}{66}
\contentsline {subsection}{\numberline {2.1.8}Distancia de Jaccard}{67}
\contentsline {subsection}{\numberline {2.1.9}Distancia Geod\IeC {\'e}sica}{67}
\contentsline {subsection}{\numberline {2.1.10}Distancia entre Grafos}{68}
\contentsline {subsection}{\numberline {2.1.11}Distancia para atributos categ\IeC {\'o}ricos: VDM}{68}
\contentsline {subsection}{\numberline {2.1.12}Distancias Especiales}{69}
\contentsline {subsection}{\numberline {2.1.13}Aprendiendo la Distancia}{70}
\contentsline {section}{\numberline {2.2}Determinando la Distancia a usar en KNN}{70}
\contentsline {section}{\numberline {2.3}Determinar el valor de K}{71}
\contentsline {subsection}{\numberline {2.3.1}Grid Search y Random Search en KNN}{73}
\contentsline {section}{\numberline {2.4}Sensibilidad a valores fuera de escala}{74}
\contentsline {section}{\numberline {2.5}Sensibilidad a atributos an\IeC {\'o}malos}{75}
\contentsline {subsection}{\numberline {2.5.1}Forward Selection}{75}
\contentsline {subsection}{\numberline {2.5.2}Backward Selection}{75}
\contentsline {section}{\numberline {2.6}Aproximaciones para KNN}{75}
\contentsline {subsection}{\numberline {2.6.1}Orden del algoritmo y eficiencia}{75}
\contentsline {subsection}{\numberline {2.6.2}Indices Espaciales: KD-Trees}{77}
\contentsline {subsection}{\numberline {2.6.3}Indices Espaciales: VP-Trees}{78}
\contentsline {subsection}{\numberline {2.6.4}L\IeC {\'\i }deres y seguidores}{81}
\contentsline {subsection}{\numberline {2.6.5}Aproximaci\IeC {\'o}n con K-Means}{82}
\contentsline {subsection}{\numberline {2.6.6}Editing}{82}
\contentsline {subsection}{\numberline {2.6.7}NN v\IeC {\'\i }a Grafos}{84}
\contentsline {subsection}{\numberline {2.6.8}LSH}{85}
\contentsline {section}{\numberline {2.7}Teor\IeC {\'\i }a de KNN}{85}
\contentsline {subsection}{\numberline {2.7.1}Teorema de Cover-Hart}{85}
\contentsline {section}{\numberline {2.8}Parzen Windows}{87}
\contentsline {section}{\numberline {2.9}KNN con pesos}{87}
\contentsline {section}{\numberline {2.10}Evitando Overfitting en KNN}{89}
\contentsline {section}{\numberline {2.11}RKNN: Ensambles basados en KNN}{89}
\contentsline {section}{\numberline {2.12}Algunas conclusiones y comentarios}{90}
\contentsline {section}{\numberline {2.13}Dimensionalidad}{90}
\contentsline {subsection}{\numberline {2.13.1}Manifolds}{91}
\contentsline {subsection}{\numberline {2.13.2}Los Datos No son Random y Siempre Tienen Pocas Dimensiones}{92}
\contentsline {subsection}{\numberline {2.13.3}Manifold Learning y Cambios de Dimensiones}{93}
\contentsline {subsection}{\numberline {2.13.4}La Maldici\IeC {\'o}n de la dimensionalidad}{94}
\contentsline {subsubsection}{El problema del muestreo}{94}
\contentsline {subsection}{\numberline {2.13.5}El Efecto de la Dimensionalidad sobre las Distancias}{95}
\contentsline {subsubsection}{El problema de los KD-Trees}{96}
\contentsline {subsubsection}{Shared Neighbors}{98}
\contentsline {chapter}{\numberline {3}Visualizaci\IeC {\'o}n}{99}
\contentsline {section}{\numberline {3.1}Principios B\IeC {\'a}sicos}{100}
\contentsline {subsection}{\numberline {3.1.1}No usar tres dimensiones de forma injustificada}{100}
\contentsline {subsection}{\numberline {3.1.2}El uso del plano}{100}
\contentsline {subsection}{\numberline {3.1.3}Oclusi\IeC {\'o}n de Informaci\IeC {\'o}n}{100}
\contentsline {subsection}{\numberline {3.1.4}Mal uso del Plano}{100}
\contentsline {subsection}{\numberline {3.1.5}Capacidad de Retenci\IeC {\'o}n}{101}
\contentsline {subsection}{\numberline {3.1.6}Ceguera al cambio}{101}
\contentsline {subsection}{\numberline {3.1.7}Uso del Color}{101}
\contentsline {subsection}{\numberline {3.1.8}Foco en la Funcionalidad}{101}
\contentsline {section}{\numberline {3.2}Principios de Visualizaci\IeC {\'o}n de Tufte}{102}
\contentsline {subsection}{\numberline {3.2.1}Excelencia}{102}
\contentsline {subsection}{\numberline {3.2.2}Principios}{103}
\contentsline {section}{\numberline {3.3}Scatter Plots}{103}
\contentsline {section}{\numberline {3.4}Gr\IeC {\'a}ficos de Barras}{111}
\contentsline {section}{\numberline {3.5}Histogramas y Plots de Densidad}{115}
\contentsline {section}{\numberline {3.6}Boxplots}{119}
\contentsline {section}{\numberline {3.7}Gr\IeC {\'a}ficos de L\IeC {\'\i }neas}{126}
\contentsline {section}{\numberline {3.8}Gr\IeC {\'a}ficos de \IeC {\'a}rea}{129}
\contentsline {section}{\numberline {3.9}Heatmaps}{132}
\contentsline {section}{\numberline {3.10}Radar Charts}{134}
\contentsline {section}{\numberline {3.11}Coordenadas Paralelas}{135}
\contentsline {section}{\numberline {3.12}Treemaps}{136}
\contentsline {section}{\numberline {3.13}Mapas y Choropleths}{137}
\contentsline {chapter}{\numberline {4}Dataframes y An\IeC {\'a}lisis Exploratorio de Datos}{139}
\contentsline {section}{\numberline {4.1}Introducci\IeC {\'o}n al An\IeC {\'a}lisis Exploratorio de Datos}{139}
\contentsline {section}{\numberline {4.2}Introducci\IeC {\'o}n a Dataframes}{139}
\contentsline {section}{\numberline {4.3}Columnas}{140}
\contentsline {subsection}{\numberline {4.3.1}Operaciones con Columnas}{140}
\contentsline {subsubsection}{map (columnas)}{141}
\contentsline {subsubsection}{apply (columnas)}{141}
\contentsline {section}{\numberline {4.4}Operaciones con Dataframes}{142}
\contentsline {subsection}{\numberline {4.4.1}Proyecci\IeC {\'o}n}{142}
\contentsline {subsection}{\numberline {4.4.2}Subsetting,Filtering o Selecci\IeC {\'o}n}{142}
\contentsline {subsection}{\numberline {4.4.3}Sort}{143}
\contentsline {subsection}{\numberline {4.4.4}Applymap}{143}
\contentsline {subsection}{\numberline {4.4.5}Apply (Dataframes)}{143}
\contentsline {subsection}{\numberline {4.4.6}Manejo de Indices}{143}
\contentsline {subsection}{\numberline {4.4.7}Operaciones de Pivoteo}{143}
\contentsline {subsection}{\numberline {4.4.8}El Paradigma Split, Apply, Combine}{145}
\contentsline {subsubsection}{Ejemplos}{145}
\contentsline {subsection}{\numberline {4.4.9}Join}{146}
\contentsline {subsubsection}{Inner Join}{146}
\contentsline {subsubsection}{Left Join}{146}
\contentsline {subsubsection}{Right Join}{147}
\contentsline {subsubsection}{Full Join}{147}
\contentsline {section}{\numberline {4.5}Ejemplo: IMDB Movies DataSet}{147}
\contentsline {subsection}{\numberline {4.5.1}\IeC {\textquestiondown }Las Pel\IeC {\'\i }culas mas largas son mejores?}{148}
\contentsline {subsection}{\numberline {4.5.2}Los Mejores Directores}{148}
\contentsline {subsection}{\numberline {4.5.3}\IeC {\textquestiondown }Las pel\IeC {\'\i }culas est\IeC {\'a}n mejorando?}{149}
\contentsline {subsection}{\numberline {4.5.4}Duraci\IeC {\'o}n de las Pel\IeC {\'\i }culas a lo largo del tiempo}{150}
\contentsline {subsection}{\numberline {4.5.5}Histograma de Ratings}{151}
\contentsline {subsection}{\numberline {4.5.6}Correlaci\IeC {\'o}n Entre Columnnas}{151}
\contentsline {subsection}{\numberline {4.5.7}Las Mejores Parejas de Actores}{152}
\contentsline {subsection}{\numberline {4.5.8}Mejores Duplas Director-Actor}{152}
\contentsline {subsection}{\numberline {4.5.9}\IeC {\textquestiondown }Qu\IeC {\'e} Directores Hacen que los Actores Actuen Mejor?}{153}
\contentsline {section}{\numberline {4.6}Manejo Interno de DataFrames}{154}
\contentsline {subsection}{\numberline {4.6.1}Valores \IeC {\'U}nicos}{154}
\contentsline {subsection}{\numberline {4.6.2}Factorize}{154}
\contentsline {subsection}{\numberline {4.6.3}Counting Sort}{155}
\contentsline {subsection}{\numberline {4.6.4}Algoritmos para Group By}{157}
\contentsline {chapter}{\numberline {5}Big Data y Almacenamiento Distribuido}{159}
\contentsline {section}{\numberline {5.1}Big Data}{159}
\contentsline {subsection}{\numberline {5.1.1}Volumen}{160}
\contentsline {subsection}{\numberline {5.1.2}Velocidad}{160}
\contentsline {subsection}{\numberline {5.1.3}Variedad}{160}
\contentsline {subsection}{\numberline {5.1.4}Veracidad}{160}
\contentsline {subsection}{\numberline {5.1.5}Valencia}{161}
\contentsline {section}{\numberline {5.2}Almacenamiento Distribuido}{161}
\contentsline {section}{\numberline {5.3}El Ecosistema Hadoop}{162}
\contentsline {section}{\numberline {5.4}Map Reduce}{163}
\contentsline {subsection}{\numberline {5.4.1}Fase de Map}{163}
\contentsline {subsection}{\numberline {5.4.2}Map Partitions}{164}
\contentsline {subsection}{\numberline {5.4.3}Fase de Reduce}{165}
\contentsline {subsubsection}{Reduce propiamente dicho}{165}
\contentsline {subsubsection}{Reduce by Key}{165}
\contentsline {subsection}{\numberline {5.4.4}Fase de Shuffle \& Sort}{166}
\contentsline {section}{\numberline {5.5}Normalizaci\IeC {\'o}n}{166}
\contentsline {section}{\numberline {5.6}KNN}{168}
\contentsline {subsection}{\numberline {5.6.1}NN}{168}
\contentsline {subsection}{\numberline {5.6.2}KNN}{169}
\contentsline {section}{\numberline {5.7}Procesamiento de Textos}{170}
\contentsline {subsection}{\numberline {5.7.1}Wordcount}{170}
\contentsline {subsection}{\numberline {5.7.2}N-gramas}{171}
\contentsline {section}{\numberline {5.8}Join}{173}
\contentsline {subsection}{\numberline {5.8.1}Producto Cartesiano}{173}
\contentsline {subsection}{\numberline {5.8.2}Inner Join y Outer Join}{174}
\contentsline {section}{\numberline {5.9}Operaciones con Matrices}{175}
\contentsline {subsection}{\numberline {5.9.1}Multiplicaci\IeC {\'o}n matriz-vector}{175}
\contentsline {subsection}{\numberline {5.9.2}Multiplicaci\IeC {\'o}n matriz-matriz}{177}
\contentsline {section}{\numberline {5.10}Redes Sociales}{179}
\contentsline {subsection}{\numberline {5.10.1}Relaciones no correspondidas}{179}
\contentsline {subsection}{\numberline {5.10.2}Contando Tri\IeC {\'a}ngulos en una Red Social}{180}
\contentsline {chapter}{\numberline {6}Complejidad, Compresi\IeC {\'o}n y Teor\IeC {\'\i }a de la Informaci\IeC {\'o}n}{183}
\contentsline {section}{\numberline {6.1}Complejidad}{183}
\contentsline {subsection}{\numberline {6.1.1}Complejidad de Kolmogorov}{183}
\contentsline {subsection}{\numberline {6.1.2}Propiedades de K(x)}{184}
\contentsline {subsection}{\numberline {6.1.3}Intractabilidad de K(x)}{187}
\contentsline {subsection}{\numberline {6.1.4}Distancia Normalizada de Compresi\IeC {\'o}n (NCD)}{187}
\contentsline {section}{\numberline {6.2}Teor\IeC {\'\i }a de la Informaci\IeC {\'o}n}{188}
\contentsline {subsection}{\numberline {6.2.1}Mensajes y Alfabetos}{188}
\contentsline {subsection}{\numberline {6.2.2}Fuente}{188}
\contentsline {subsection}{\numberline {6.2.3}C\IeC {\'o}digos}{189}
\contentsline {subsubsection}{C\IeC {\'o}digos prefijos}{189}
\contentsline {subsubsection}{Desigualdad de Kraft}{190}
\contentsline {subsection}{\numberline {6.2.4}Entrop\'i{a}}{190}
\contentsline {subsection}{\numberline {6.2.5}Entrop\IeC {\'\i }a Conjunta}{193}
\contentsline {subsection}{\numberline {6.2.6}Entrop\IeC {\'\i }a Condicional}{194}
\contentsline {subsection}{\numberline {6.2.7}Informaci\IeC {\'o}n Mutua}{196}
\contentsline {subsection}{\numberline {6.2.8}Ejemplo}{197}
\contentsline {subsection}{\numberline {6.2.9}Entrop\IeC {\'\i }a Relativa}{198}
\contentsline {subsection}{\numberline {6.2.10}Entrop\IeC {\'\i }a Cruzada}{199}
\contentsline {section}{\numberline {6.3}Teor\IeC {\'\i }a de la Compresi\IeC {\'o}n de Datos}{199}
\contentsline {subsection}{\numberline {6.3.1}Modelar los Datos}{200}
\contentsline {subsection}{\numberline {6.3.2}Codificar los Datos}{200}
\contentsline {subsection}{\numberline {6.3.3}No Existe un Compresor que Pueda Comprimir Cualquier Archivo}{200}
\contentsline {section}{\numberline {6.4}C\IeC {\'o}digos de Huffman}{201}
\contentsline {subsection}{\numberline {6.4.1}Huffman Est\IeC {\'a}tico}{201}
\contentsline {subsection}{\numberline {6.4.2}Huffman Din\IeC {\'a}mico}{203}
\contentsline {subsection}{\numberline {6.4.3}Modelos de Orden Superior}{204}
\contentsline {subsubsection}{El problema de frecuencia cero}{206}
\contentsline {section}{\numberline {6.5}Compresi\IeC {\'o}n Aritm\IeC {\'e}tica}{207}
\contentsline {subsection}{\numberline {6.5.1}Implementaci\IeC {\'o}n y Renormalizaci\IeC {\'o}n}{208}
\contentsline {section}{\numberline {6.6}PPMC}{209}
\contentsline {subsection}{\numberline {6.6.1}Ejemplo PPMC}{210}
\contentsline {subsubsection}{Comprimir A}{210}
\contentsline {subsubsection}{Comprimir (A)B}{211}
\contentsline {subsubsection}{Comprimir (AB)C}{212}
\contentsline {subsubsection}{Comprimir A(BC)C}{213}
\contentsline {subsubsection}{Comprimir AB(CC)C}{214}
\contentsline {subsubsection}{Comprimir ABC(CC)C}{215}
\contentsline {subsubsection}{Comprimir ABCC(CC)}{215}
\contentsline {subsection}{\numberline {6.6.2}PPMC Aspectos claves}{217}
\contentsline {subsection}{\numberline {6.6.3}Descompresi\IeC {\'o}n}{218}
\contentsline {subsection}{\numberline {6.6.4}Orden M\IeC {\'a}ximo para PPMC}{218}
\contentsline {subsection}{\numberline {6.6.5}Variantes de PPMC}{219}
\contentsline {subsection}{\numberline {6.6.6}PPMQ y SSE}{219}
\contentsline {subsection}{\numberline {6.6.7}Modelos Generativos Basados en PPMC}{220}
\contentsline {subsection}{\numberline {6.6.8}PPMC conclusiones}{222}
\contentsline {section}{\numberline {6.7}DMC: Dynamic Markov Compression}{222}
\contentsline {subsection}{\numberline {6.7.1}Clonaci\IeC {\'o}n}{223}
\contentsline {section}{\numberline {6.8}La Familia LZ de Compresores}{224}
\contentsline {subsection}{\numberline {6.8.1}RLE}{225}
\contentsline {subsection}{\numberline {6.8.2}LZSS}{225}
\contentsline {subsubsection}{Parsing en LZSS}{227}
\contentsline {subsection}{\numberline {6.8.3}DEFLATE (LzHuf)}{228}
\contentsline {subsection}{\numberline {6.8.4}LZW}{228}
\contentsline {subsubsection}{El Descompresor LZW}{232}
\contentsline {subsubsection}{Los Dragones de LZW}{233}
\contentsline {subsubsection}{Mecanismo de Clearing}{233}
\contentsline {subsubsection}{Suboptimalidad de la codificaci\IeC {\'o}n en LZW}{234}
\contentsline {subsubsection}{Complejidad LZ}{234}
\contentsline {subsection}{\numberline {6.8.5}Snappy}{235}
\contentsline {subsubsection}{Literales}{235}
\contentsline {subsubsection}{Matches}{236}
\contentsline {subsection}{\numberline {6.8.6}LZMA}{236}
\contentsline {subsubsection}{Tipos de C\IeC {\'o}digos}{237}
\contentsline {subsubsection}{Codificaci\IeC {\'o}n Aritm\IeC {\'e}tica}{238}
\contentsline {subsubsection}{Tama\IeC {\~n}o de la Ventana y Optimal Parsing}{239}
\contentsline {subsubsection}{Uso de Exclusi\IeC {\'o}n}{239}
\contentsline {section}{\numberline {6.9}Block Sorting}{239}
\contentsline {subsection}{\numberline {6.9.1}Archivos localizados y MTF}{239}
\contentsline {subsection}{\numberline {6.9.2}La Transformaci\IeC {\'o}n de Burrows y Wheeler}{241}
\contentsline {section}{\numberline {6.10}\IeC {\'I}ndices FM}{242}
\contentsline {subsection}{\numberline {6.10.1}\IeC {\'I}ndices de Sufijos}{243}
\contentsline {subsection}{\numberline {6.10.2}Mapeo LF}{243}
\contentsline {section}{\numberline {6.11}Comprimiendo el \IeC {\'I}ndice FM}{245}
\contentsline {section}{\numberline {6.12}Estructura Final del \IeC {\'I}ndice FM}{246}
\contentsline {section}{\numberline {6.13}PAQ}{247}
\contentsline {subsection}{\numberline {6.13.1}Modelos de PAQ}{248}
\contentsline {subsection}{\numberline {6.13.2}SSE}{248}
\contentsline {chapter}{\numberline {7}Hashing}{251}
\contentsline {section}{\numberline {7.1}Introducci\IeC {\'o}n}{251}
\contentsline {subsection}{\numberline {7.1.1}Funciones de Hashing Elementales}{251}
\contentsline {subsubsection}{El espacio de claves es mayor al espacio de direcciones}{252}
\contentsline {subsubsection}{El espacio de direcciones es mayor al espacio de claves}{252}
\contentsline {subsection}{\numberline {7.1.2}Tablas de Hash Elementales}{252}
\contentsline {subsection}{\numberline {7.1.3}Resoluci\IeC {\'o}n de Colisiones}{253}
\contentsline {subsubsection}{B\IeC {\'u}squeda lineal (direccionamiento abierto o hashing cerrado)}{253}
\contentsline {subsubsection}{Encadenamiento de Sin\IeC {\'o}nimos (direccionamiento cerrado o hashing abierto)}{253}
\contentsline {section}{\numberline {7.2}Implementaci\IeC {\'o}n de Diccionarios con Hash Tables}{254}
\contentsline {section}{\numberline {7.3}Hopscotch Hashing}{256}
\contentsline {section}{\numberline {7.4}Funciones de Hashing}{257}
\contentsline {subsection}{\numberline {7.4.1}FNV}{258}
\contentsline {subsection}{\numberline {7.4.2}Jenkins}{258}
\contentsline {subsection}{\numberline {7.4.3}Pearson}{259}
\contentsline {subsection}{\numberline {7.4.4}Funciones criptogr\IeC {\'a}ficas}{260}
\contentsline {subsubsection}{Construcci\IeC {\'o}n de Merkle-Damgard}{260}
\contentsline {subsubsection}{Construcci\IeC {\'o}n de Davis-Meyer}{261}
\contentsline {section}{\numberline {7.5}Hashing Universal}{262}
\contentsline {subsection}{\numberline {7.5.1}Definici\IeC {\'o}n de Hashing Universal}{262}
\contentsline {subsection}{\numberline {7.5.2}Hashing Universal para valores at\IeC {\'o}micos (num\IeC {\'e}ricos)}{262}
\contentsline {subsection}{\numberline {7.5.3}Hashing Universal para claves de longitud fija}{263}
\contentsline {subsection}{\numberline {7.5.4}Hashing Universal para claves de longitud variable}{264}
\contentsline {section}{\numberline {7.6}Cuckoo Hashing}{265}
\contentsline {subsection}{\numberline {7.6.1}Cuckoo Hashing With a Stash}{268}
\contentsline {subsection}{\numberline {7.6.2}Estructuras de Datos basadas en Cuckoo Hashing}{268}
\contentsline {section}{\numberline {7.7}Balls \& Bins}{270}
\contentsline {section}{\numberline {7.8}Hashing Perfecto}{271}
\contentsline {subsection}{\numberline {7.8.1}Esquema FKS}{271}
\contentsline {subsection}{\numberline {7.8.2}Hashing Perfecto Din\IeC {\'a}mico}{273}
\contentsline {subsection}{\numberline {7.8.3}Hashing Perfecto y M\IeC {\'\i }nimo: Hash and Displace}{273}
\contentsline {subsection}{\numberline {7.8.4}Hashing Perfecto y M\IeC {\'\i }nimo: Algoritmo II}{276}
\contentsline {section}{\numberline {7.9}Hashing Consistente}{279}
\contentsline {section}{\numberline {7.10}The Hashing Trick}{281}
\contentsline {subsection}{\numberline {7.10.1}M\IeC {\'e}todo de Weinberger y Hash Kernels}{282}
\contentsline {subsection}{\numberline {7.10.2}Filtros de Spam Personalizados}{283}
\contentsline {section}{\numberline {7.11}El Teorema de Johnson y Lindenstrauss}{284}
\contentsline {subsection}{\numberline {7.11.1}Primera Proyecci\IeC {\'o}n}{284}
\contentsline {subsection}{\numberline {7.11.2}Segunda Proyecci\IeC {\'o}n}{285}
\contentsline {subsection}{\numberline {7.11.3}Tercera Proyecci\IeC {\'o}n}{285}
\contentsline {subsection}{\numberline {7.11.4}Por qu\IeC {\'e} funciona el Teorema de JL}{286}
\contentsline {subsection}{\numberline {7.11.5}The Hashing Trick y el Teorema de Johnson Lindenstrauss}{289}
\contentsline {chapter}{\numberline {8}LSH}{293}
\contentsline {section}{\numberline {8.1}Minhashes}{294}
\contentsline {section}{\numberline {8.2}Amplificaci\IeC {\'o}n de Familias LSH}{296}
\contentsline {subsection}{\numberline {8.2.1}Reducci\IeC {\'o}n de Falsos Positivos}{296}
\contentsline {subsection}{\numberline {8.2.2}Reducci\IeC {\'o}n de Falsos Negativos}{297}
\contentsline {subsection}{\numberline {8.2.3}Combinando los dos m\IeC {\'e}todos}{298}
\contentsline {subsection}{\numberline {8.2.4}Encontrando los valores de $b$ y $r$}{299}
\contentsline {section}{\numberline {8.3}LSH para la distancia de Jaccard}{300}
\contentsline {section}{\numberline {8.4}Minhash para la distancia angular}{303}
\contentsline {subsection}{\numberline {8.4.1}El m\IeC {\'e}todo de los hiperplanos}{304}
\contentsline {subsection}{\numberline {8.4.2}Voronoi Hashing}{305}
\contentsline {subsection}{\numberline {8.4.3}El M\IeC {\'e}todo de los Pol\IeC {\'\i }topos}{306}
\contentsline {section}{\numberline {8.5}LSH para la distancia euclideana}{307}
\contentsline {subsection}{\numberline {8.5.1}El M\IeC {\'e}todo de las Proyecciones Aleatorias}{307}
\contentsline {subsection}{\numberline {8.5.2}M\IeC {\'e}todos basados en K-Means: KLSH}{309}
\contentsline {subsection}{\numberline {8.5.3}M\IeC {\'e}todos basados en K-Means: K-Means jer\IeC {\'a}rquico}{310}
\contentsline {section}{\numberline {8.6}LSH para la distancia Hamming}{310}
\contentsline {section}{\numberline {8.7}Multi-Probe LSH}{310}
\contentsline {subsection}{\numberline {8.7.1}Multi-Probing para la distancia euclideana, m\IeC {\'e}todo de proyecciones aleatorias}{311}
\contentsline {subsection}{\numberline {8.7.2}Multi-Probing para la distancia euclideana, m\IeC {\'e}todos basados en K-Means}{311}
\contentsline {subsection}{\numberline {8.7.3}Multi-Probing para la distancia angular, m\IeC {\'e}todo de proyecciones aleatorias}{311}
\contentsline {subsection}{\numberline {8.7.4}Multi-Probing para la distancia angular, m\IeC {\'e}todo de los pol\IeC {\'\i }topos}{312}
\contentsline {subsection}{\numberline {8.7.5}Multi-Probing para la distancia de Jaccard}{312}
\contentsline {section}{\numberline {8.8}LSH Conclusiones}{312}
\contentsline {section}{\numberline {8.9}LSH Para M\IeC {\'a}xima semejanza}{313}
\contentsline {subsection}{\numberline {8.9.1}Usando la Longitud}{313}
\contentsline {subsection}{\numberline {8.9.2}Usando Prefijos}{314}
\contentsline {subsection}{\numberline {8.9.3}Prefijos y Posiciones}{314}
\contentsline {subsection}{\numberline {8.9.4}Prefijos, Posiciones y Longitud}{315}
\contentsline {subsubsection}{Primer caso: $p \geq q$}{316}
\contentsline {subsubsection}{Segundo caso: $p < q$}{316}
\contentsline {section}{\numberline {8.10}LSH forests}{317}
\contentsline {chapter}{\numberline {9}Reducci\IeC {\'o}n de Dimensiones}{319}
\contentsline {section}{\numberline {9.1}Introducci\IeC {\'o}n}{319}
\contentsline {subsection}{\numberline {9.1.1}Para Combatir la Maldici\IeC {\'o}n de la Dimensionalidad}{319}
\contentsline {subsection}{\numberline {9.1.2}Por motivos de performance}{319}
\contentsline {subsection}{\numberline {9.1.3}Como un mecanismo de Feature Engineering}{320}
\contentsline {subsection}{\numberline {9.1.4}Para visualizar los datos}{320}
\contentsline {section}{\numberline {9.2}SVD}{320}
\contentsline {subsection}{\numberline {9.2.1}Interpretaci\IeC {\'o}n algebraica de la SVD (muy importante!)}{323}
\contentsline {subsection}{\numberline {9.2.2}Aproximaci\IeC {\'o}n de Rango $k$}{323}
\contentsline {subsection}{\numberline {9.2.3}Usando la SVD para aproximar la dimensionalidad intr\IeC {\'\i }nseca de los datos}{326}
\contentsline {subsection}{\numberline {9.2.4}Reducci\IeC {\'o}n de dimensiones usando la SVD}{327}
\contentsline {subsection}{\numberline {9.2.5}Compresi\IeC {\'o}n de Im\IeC {\'a}genes usando la SVD}{327}
\contentsline {subsection}{\numberline {9.2.6}Eigenfaces}{329}
\contentsline {subsection}{\numberline {9.2.7}SVD en la pr\IeC {\'a}ctica}{331}
\contentsline {subsection}{\numberline {9.2.8}Agregando nuevos datos en el nuevo espacio}{331}
\contentsline {section}{\numberline {9.3}PCA: Principal Component Analysis}{332}
\contentsline {subsection}{\numberline {9.3.1}PCA=SVD}{334}
\contentsline {subsection}{\numberline {9.3.2}Conclusi\IeC {\'o}n}{335}
\contentsline {section}{\numberline {9.4}MDS: Multidimensional Scaling}{335}
\contentsline {subsection}{\numberline {9.4.1}Perceptual Mapping}{339}
\contentsline {section}{\numberline {9.5}ISOMAP}{339}
\contentsline {subsection}{\numberline {9.5.1}Construcci\IeC {\'o}n del Grafo}{339}
\contentsline {subsection}{\numberline {9.5.2}C\IeC {\'a}lculo de Distancias}{339}
\contentsline {subsection}{\numberline {9.5.3}MDS}{340}
\contentsline {section}{\numberline {9.6}Laplacian Eigenmaps}{340}
\contentsline {subsection}{\numberline {9.6.1}Creaci\IeC {\'o}n del Grafo}{341}
\contentsline {subsection}{\numberline {9.6.2}Construcci\IeC {\'o}n del Laplaciano}{341}
\contentsline {subsection}{\numberline {9.6.3}Descomposici\IeC {\'o}n Espectral del Laplaciano}{342}
\contentsline {subsection}{\numberline {9.6.4}Teor\IeC {\'\i }a de la descomposici\IeC {\'o}n espectral}{342}
\contentsline {subsection}{\numberline {9.6.5}Ejemplos}{344}
\contentsline {section}{\numberline {9.7}TSNE}{346}
\contentsline {section}{\numberline {9.8}Teorema Fundamental de la Dimensionalidad}{348}
\contentsline {chapter}{\numberline {10}Information Retrieval}{349}
\contentsline {section}{\numberline {10.1}Consultas Puntuales y el Dominio de Trabajo}{350}
\contentsline {section}{\numberline {10.2}\IeC {\'\i }ndices Invertidos}{350}
\contentsline {subsection}{\numberline {10.2.1}Estructura de un \IeC {\'\i }ndice Invertido}{351}
\contentsline {subsection}{\numberline {10.2.2}Almacenamiento del L\IeC {\'e}xico}{351}
\contentsline {subsubsection}{L\IeC {\'e}xico Concatenado}{351}
\contentsline {subsubsection}{Front Coding}{352}
\contentsline {subsubsection}{Mucha memoria = Mejor Performance}{353}
\contentsline {subsection}{\numberline {10.2.3}Almacenamiento de Punteros}{353}
\contentsline {subsection}{\numberline {10.2.4}Estructura General del \IeC {\'\i }ndice}{354}
\contentsline {subsection}{\numberline {10.2.5}Almacenamiento de Punteros -Avanzado-}{355}
\contentsline {subsubsection}{C\IeC {\'o}digo Unario}{355}
\contentsline {subsubsection}{C\IeC {\'o}digos Gamma}{355}
\contentsline {subsubsection}{C\IeC {\'o}digos Delta}{356}
\contentsline {subsubsection}{Modelo Local de Tipo Bernoulli}{356}
\contentsline {subsection}{\numberline {10.2.6}Construcci\IeC {\'o}n del \IeC {\'\i }ndice Invertido}{359}
\contentsline {subsection}{\numberline {10.2.7}Qu\IeC {\'e} Indexar}{360}
\contentsline {subsubsection}{Parsing}{360}
\contentsline {subsubsection}{Case Folding}{361}
\contentsline {subsubsection}{Stop Words}{361}
\contentsline {subsubsection}{Stemming}{361}
\contentsline {section}{\numberline {10.3}Consultas de Frases y Proximidad}{361}
\contentsline {section}{\numberline {10.4}Signature Files}{363}
\contentsline {subsection}{\numberline {10.4.1}Consultas}{363}
\contentsline {section}{\numberline {10.5}Bitmaps}{364}
\contentsline {subsection}{\numberline {10.5.1}Compresi\IeC {\'o}n de Bitmaps}{365}
\contentsline {subsection}{\numberline {10.5.2}Roaring Bitmaps}{366}
\contentsline {section}{\numberline {10.6}Consultas Ranqueadas}{367}
\contentsline {subsection}{\numberline {10.6.1}BOW y Productos Internos}{367}
\contentsline {subsection}{\numberline {10.6.2}TF-IDF}{368}
\contentsline {subsubsection}{BM25}{370}
\contentsline {subsubsection}{Normalizaci\IeC {\'o}n de la Longitud de los documentos}{370}
\contentsline {section}{\numberline {10.7}Indexaci\IeC {\'o}n Sem\IeC {\'a}ntica Latente}{371}
\contentsline {section}{\numberline {10.8}Evaluaci\IeC {\'o}n de Consultas}{372}
\contentsline {section}{\numberline {10.9}Consultas Probabil\IeC {\'\i }sticas}{374}
\contentsline {section}{\numberline {10.10}Procesamiento de Lenguaje Natural}{374}
\contentsline {subsection}{\numberline {10.10.1}Modelo de N-Gramas}{375}
\contentsline {subsection}{\numberline {10.10.2}Calculando Probabilidades en el modelo de N-Gramas}{376}
\contentsline {subsection}{\numberline {10.10.3}Maximum Likelihood (M\IeC {\'a}xima semejanza)}{377}
\contentsline {subsection}{\numberline {10.10.4}El Problema de las Probabilidades Nulas}{378}
\contentsline {subsection}{\numberline {10.10.5}Smoothing: Correcci\IeC {\'o}n de Laplace}{378}
\contentsline {subsection}{\numberline {10.10.6}Interpolaci\IeC {\'o}n y Backoff}{379}
\contentsline {subsubsection}{Stupid Backoff}{380}
\contentsline {subsection}{\numberline {10.10.7}Good Turing Smoothing}{380}
\contentsline {subsection}{\numberline {10.10.8}Algoritmo de Kneser-Ney}{381}
\contentsline {section}{\numberline {10.11}Learning to Rank}{384}
\contentsline {subsection}{\numberline {10.11.1}Algoritmos Pointwise}{384}
\contentsline {subsection}{\numberline {10.11.2}Algoritmos Pairwise y Listwise}{387}
\contentsline {chapter}{\numberline {11}Introducci\IeC {\'o}n a Machine Learning}{389}
\contentsline {section}{\numberline {11.1}Evaluaci\IeC {\'o}n de Algoritmos de ML}{390}
\contentsline {subsection}{\numberline {11.1.1}Cross Validation}{390}
\contentsline {section}{\numberline {11.2}Los Problemas de ML: Overfitting y Underfitting}{391}
\contentsline {section}{\numberline {11.3}El Teorema de No Free Lunch}{394}
\contentsline {section}{\numberline {11.4}La Uni\IeC {\'o}n hace la fuerza: Ensambles}{396}
\contentsline {subsection}{\numberline {11.4.1}Bagging}{396}
\contentsline {subsection}{\numberline {11.4.2}Boosting}{398}
\contentsline {subsection}{\numberline {11.4.3}Combinando Algoritmos Diferentes}{398}
\contentsline {subsubsection}{Majority Voting}{398}
\contentsline {subsubsection}{Averaging}{399}
\contentsline {subsubsection}{Blending}{400}
\contentsline {paragraph}{Blending}{401}
\contentsline {section}{\numberline {11.5}Teor\IeC {\'\i }a de Machine Learning}{402}
\contentsline {subsection}{\numberline {11.5.1}Desigualdad de Hoeffding}{402}
\contentsline {subsection}{\numberline {11.5.2}Error de Entrenamiento}{402}
\contentsline {subsection}{\numberline {11.5.3}Error de Generalizaci\IeC {\'o}n}{403}
\contentsline {subsection}{\numberline {11.5.4}Overfitting y Underfitting}{403}
\contentsline {subsection}{\numberline {11.5.5}El Problema de Clasificaci\IeC {\'o}n}{403}
\contentsline {subsection}{\numberline {11.5.6}El Espacio de Hip\IeC {\'o}tesis}{404}
\contentsline {subsection}{\numberline {11.5.7}Minimizaci\IeC {\'o}n Emp\IeC {\'\i }rica del Riesgo en $\mathcal {H}$}{404}
\contentsline {subsection}{\numberline {11.5.8}Cuando $|\mathcal {H}|$ es finito}{404}
\contentsline {subsection}{\numberline {11.5.9}Cuando $|\mathcal {H}|$ es infinito}{408}
\contentsline {subsection}{\numberline {11.5.10}El Principio de la M\IeC {\'\i }nima Descripci\IeC {\'o}n}{411}
\contentsline {subsection}{\numberline {11.5.11}Clasificaci\IeC {\'o}n con Algoritmos de Compresi\IeC {\'o}n}{413}
\contentsline {subsubsection}{SMDL: Standard Minimum Description Length}{413}
\contentsline {subsubsection}{AMDL: Approximate Minimum Description Length}{414}
\contentsline {subsubsection}{BCN: Best Compression Neighbor}{414}
\contentsline {chapter}{\numberline {12}Clasificaci\IeC {\'o}n}{415}
\contentsline {section}{\numberline {12.1}\IeC {\'a}rboles de decisi\IeC {\'o}n}{415}
\contentsline {subsection}{\numberline {12.1.1}ID3}{415}
\contentsline {subsection}{\numberline {12.1.2}C4.5}{418}
\contentsline {section}{\numberline {12.2}Random Forests}{419}
\contentsline {subsection}{\numberline {12.2.1}Distancia RF}{420}
\contentsline {section}{\numberline {12.3}XGBoost}{421}
\contentsline {subsection}{\numberline {12.3.1}Boosting Trees}{421}
\contentsline {subsection}{\numberline {12.3.2}Complejidad de un \IeC {\'a}rbol}{423}
\contentsline {subsection}{\numberline {12.3.3}Optimizando la funci\IeC {\'o}n objetivo}{424}
\contentsline {subsection}{\numberline {12.3.4}Calculando el Score de una Estructura}{425}
\contentsline {subsection}{\numberline {12.3.5}Buscando la Estructura del \IeC {\'a}rbol}{425}
\contentsline {subsection}{\numberline {12.3.6}XGBoost en la pr\IeC {\'a}ctica}{426}
\contentsline {section}{\numberline {12.4}Na\"{i}ve Bayes}{427}
\contentsline {subsection}{\numberline {12.4.1}El Problema de las probabilidades cero}{428}
\contentsline {subsection}{\numberline {12.4.2}Bayes en acci\IeC {\'o}n}{429}
\contentsline {section}{\numberline {12.5}Perceptr\IeC {\'o}n}{429}
\contentsline {subsection}{\numberline {12.5.1}Algoritmo Base}{430}
\contentsline {subsection}{\numberline {12.5.2}Algoritmo Mejorado}{431}
\contentsline {subsection}{\numberline {12.5.3}Normalizaci\IeC {\'o}n Online}{432}
\contentsline {subsection}{\numberline {12.5.4}Predicciones}{432}
\contentsline {subsection}{\numberline {12.5.5}Convergencia}{432}
\contentsline {subsection}{\numberline {12.5.6}Funciones de Activaci\IeC {\'o}n}{434}
\contentsline {subsubsection}{Signo}{434}
\contentsline {subsubsection}{Sigmoid}{434}
\contentsline {subsubsection}{Tanh}{435}
\contentsline {subsection}{\numberline {12.5.7}Perceptr\IeC {\'o}n Multiclase}{436}
\contentsline {subsubsection}{One vs All}{436}
\contentsline {subsubsection}{One vs One}{436}
\contentsline {subsection}{\numberline {12.5.8}Conclusiones}{437}
\contentsline {section}{\numberline {12.6}SVM}{437}
\contentsline {subsection}{\numberline {12.6.1}Margen Funcional}{439}
\contentsline {subsection}{\numberline {12.6.2}Margen Geom\IeC {\'e}trico}{439}
\contentsline {subsection}{\numberline {12.6.3}Support Vectors}{441}
\contentsline {subsection}{\numberline {12.6.4}El Problema Dual}{444}
\contentsline {subsection}{\numberline {12.6.5}Algoritmo SMO (Sequential Minimal Optimization) [opcional]}{446}
\contentsline {subsection}{\numberline {12.6.6}Soft Margin}{448}
\contentsline {subsection}{\numberline {12.6.7}SMO (Soft Marging)}{450}
\contentsline {section}{\numberline {12.7}The Kernel Trick}{451}
\contentsline {subsection}{\numberline {12.7.1}Kernel Polin\IeC {\'o}mico}{455}
\contentsline {section}{\numberline {12.8}Teorema de Mercer}{455}
\contentsline {subsection}{\numberline {12.8.1}Kernel Gaussiano (RBF)}{456}
\contentsline {section}{\numberline {12.9}Aproximaci\IeC {\'o}n de Nystrom}{458}
\contentsline {section}{\numberline {12.10}Kernel SVM}{459}
\contentsline {subsection}{\numberline {12.10.1}El efecto de C y $\sigma $}{460}
\contentsline {section}{\numberline {12.11}SVM Transductivo}{461}
\contentsline {section}{\numberline {12.12}SVM Ordinal}{463}
\contentsline {section}{\numberline {12.13}Conclusiones}{463}
\contentsline {chapter}{\numberline {13}Clustering}{465}
\contentsline {section}{\numberline {13.1}Clustering Jer\IeC {\'a}rquico}{466}
\contentsline {subsection}{\numberline {13.1.1}Distancia Entre Clusters}{466}
\contentsline {subsection}{\numberline {13.1.2}Encontrando la Cantidad de Clusters}{467}
\contentsline {subsection}{\numberline {13.1.3}Dendrogramas}{468}
\contentsline {subsection}{\numberline {13.1.4}Performance}{470}
\contentsline {section}{\numberline {13.2}K-Means}{470}
\contentsline {subsection}{\numberline {13.2.1}El Problema General}{471}
\contentsline {subsection}{\numberline {13.2.2}El Algoritmo de Lloyd (conocido como K-Means)}{472}
\contentsline {subsection}{\numberline {13.2.3}K-Means++}{475}
\contentsline {subsection}{\numberline {13.2.4}Como buscar K}{477}
\contentsline {subsection}{\numberline {13.2.5}Diagramas de Voronoi}{478}
\contentsline {subsection}{\numberline {13.2.6}Kmeans Online}{479}
\contentsline {subsubsection}{Teor\IeC {\'\i }a de K-Means Online}{481}
\contentsline {subsection}{\numberline {13.2.7}Soft K-Means}{482}
\contentsline {subsubsection}{Versi\IeC {\'o}n Online}{484}
\contentsline {subsection}{\numberline {13.2.8}La base de Kmeans y sus aplicaciones}{485}
\contentsline {subsection}{\numberline {13.2.9}Reducci\IeC {\'o}n de Dimensiones con K-Means}{488}
\contentsline {subsection}{\numberline {13.2.10}Kmeans Esf\IeC {\'e}rico}{489}
\contentsline {subsubsection}{Versi\IeC {\'o}n Online}{491}
\contentsline {subsubsection}{Voronoi Esf\IeC {\'e}rico}{491}
\contentsline {subsection}{\numberline {13.2.11}Kernel K-Means}{491}
\contentsline {subsection}{\numberline {13.2.12}K-Modes}{494}
\contentsline {subsection}{\numberline {13.2.13}K-Means y el Problema de los Vecinos Mas Cercanos}{494}
\contentsline {subsubsection}{Datos Masivos, realmente masivos}{495}
\contentsline {subsection}{\numberline {13.2.14}The Inverted Multi-Index}{496}
\contentsline {subsubsection}{K-means Trees}{498}
\contentsline {section}{\numberline {13.3}Clustering Espectral}{498}
\contentsline {subsection}{\numberline {13.3.1}Primer paso: construir la matriz de afinidad}{498}
\contentsline {subsection}{\numberline {13.3.2}Segundo paso: construir la matriz Laplaciana}{499}
\contentsline {subsection}{\numberline {13.3.3}Tercer paso: Autovalores y Autovectores de la matriz Laplaciana}{500}
\contentsline {subsection}{\numberline {13.3.4}Selecci\IeC {\'o}n de Hiper-Par\IeC {\'a}metros}{502}
\contentsline {subsection}{\numberline {13.3.5}Aproximaciones: KASP}{503}
\contentsline {section}{\numberline {13.4}CURE}{504}
\contentsline {section}{\numberline {13.5}DBScan}{505}
\contentsline {section}{\numberline {13.6}HDBScan}{507}
\contentsline {subsection}{\numberline {13.6.1}La Distancia MRD}{508}
\contentsline {subsection}{\numberline {13.6.2}El \IeC {\'a}rbol Generador M\IeC {\'\i }nimo}{508}
\contentsline {subsection}{\numberline {13.6.3}Clustering Jer\IeC {\'a}rquico}{509}
\contentsline {subsection}{\numberline {13.6.4}Extraer los Clusters}{510}
\contentsline {section}{\numberline {13.7}Biclustering}{513}
\contentsline {subsection}{\numberline {13.7.1}Co-clustering Espectral}{514}
\contentsline {subsection}{\numberline {13.7.2}LSI Co-clustering}{516}
\contentsline {section}{\numberline {13.8}NMF: Non-negative matrix factorization}{517}
\contentsline {subsection}{\numberline {13.8.1}Algoritmo Multiplicativo}{518}
\contentsline {subsection}{\numberline {13.8.2}La Base de NMF}{519}
\contentsline {chapter}{\numberline {14}Sistemas de Recomendaciones}{523}
\contentsline {section}{\numberline {14.1}Introducci\IeC {\'o}n: El modelo de descubrimiento}{523}
\contentsline {section}{\numberline {14.2}Caracter\IeC {\'\i }sticas de un Sistema de Recomendaciones}{524}
\contentsline {subsection}{\numberline {14.2.1}Precisi\IeC {\'o}n}{524}
\contentsline {subsection}{\numberline {14.2.2}Serendipity}{524}
\contentsline {subsection}{\numberline {14.2.3}Diversidad}{526}
\contentsline {subsection}{\numberline {14.2.4}Otros desaf\IeC {\'\i }os para un sistema de recomendaciones}{526}
\contentsline {subsubsection}{Los gustos de los usuarios pueden cambiar}{526}
\contentsline {subsubsection}{La influencia del tiempo}{527}
\contentsline {subsubsection}{El usuario a veces quiere ver cosas que no le gustan}{527}
\contentsline {subsubsection}{Lo que el usuario califica vs lo que el usuario ve}{527}
\contentsline {section}{\numberline {14.3}Sistemas No Personalizados}{528}
\contentsline {subsection}{\numberline {14.3.1}Recomendando Comentarios en Reddit}{528}
\contentsline {subsubsection}{Primera Aproximaci\IeC {\'o}n Diferencia entre Upvotes y Downvotes}{528}
\contentsline {subsubsection}{Seugunda Aproximaci\IeC {\'o}n Proporci\IeC {\'o}n de Upvotes sobre el total}{528}
\contentsline {subsection}{\numberline {14.3.2}Intervalos de Confianza}{529}
\contentsline {subsection}{\numberline {14.3.3}Ordenando las noticias en Reddit}{530}
\contentsline {section}{\numberline {14.4}Sistemas Basados en Contenidos (Content Based)}{532}
\contentsline {section}{\numberline {14.5}Collaborative Filtering}{533}
\contentsline {subsection}{\numberline {14.5.1}Semejanza User User}{534}
\contentsline {subsection}{\numberline {14.5.2}Semejanza Item Item}{535}
\contentsline {subsection}{\numberline {14.5.3}Collaborative Filtering en base a Desviaciones}{537}
\contentsline {section}{\numberline {14.6}Evaluaci\IeC {\'o}n de Sistemas de Recomendaci\IeC {\'o}n}{539}
\contentsline {subsection}{\numberline {14.6.1}Evaluaci\IeC {\'o}n basada en las predicciones}{540}
\contentsline {subsection}{\numberline {14.6.2}Evaluaci\IeC {\'o}n basada en el orden de las recomendaciones}{541}
\contentsline {subsubsection}{MPR: Mean Reciprocal Rank}{541}
\contentsline {subsubsection}{MAP: Mean Average Precision}{542}
\contentsline {subsubsection}{Correlaci\IeC {\'o}n de Spearman}{542}
\contentsline {subsubsection}{Normalized Discounted Cumulative Gain}{542}
\contentsline {section}{\numberline {14.7}Modelos Latentes}{543}
\contentsline {subsection}{\numberline {14.7.1}SVD++}{543}
\contentsline {subsubsection}{Algoritmo de Gradient Descent para calcular $Q$ y $P$}{545}
\contentsline {section}{\numberline {14.8}Factores Temporales}{546}
\contentsline {section}{\numberline {14.9}Factorization Machines}{547}
\contentsline {subsection}{\numberline {14.9.1}Expresividad de FM}{549}
\contentsline {subsection}{\numberline {14.9.2}Como funcionan los par\IeC {\'a}metros de una FM}{549}
\contentsline {subsection}{\numberline {14.9.3}Algoritmos basados en FMs}{549}
\contentsline {subsection}{\numberline {14.9.4}Aprendizaje de FMs}{550}
\contentsline {subsection}{\numberline {14.9.5}LibFM}{550}
\contentsline {section}{\numberline {14.10}El Problema del Cold-Start: Multi-Armed Bandits}{551}
\contentsline {subsection}{\numberline {14.10.1}Multi-Armed Bandits}{552}
\contentsline {subsection}{\numberline {14.10.2}Algoritmo Epsilon}{552}
\contentsline {subsection}{\numberline {14.10.3}Algoritmo UCB1}{552}
\contentsline {section}{\numberline {14.11}Diversidad: Optimizaci\IeC {\'o}n Submodular}{553}
\contentsline {subsection}{\numberline {14.11.1}Recomendaciones Personalizadas}{556}
\contentsline {section}{\numberline {14.12}Learning to Rank}{556}
\contentsline {section}{\numberline {14.13}El Sistema Completo}{557}
\contentsline {chapter}{\numberline {15}Streaming}{559}
\contentsline {section}{\numberline {15.1}Reservoir Sampling}{559}
\contentsline {section}{\numberline {15.2}Momentos de un Stream}{561}
\contentsline {section}{\numberline {15.3}Flajolet-Martin y el HyperLogLog}{562}
\contentsline {subsection}{\numberline {15.3.1}Flajolet-Martin}{562}
\contentsline {subsection}{\numberline {15.3.2}HyperLogLog}{563}
\contentsline {section}{\numberline {15.4}AMS}{563}
\contentsline {section}{\numberline {15.5}DGIM}{564}
\contentsline {subsection}{\numberline {15.5.1}Extensi\IeC {\'o}n para valores enteros}{567}
\contentsline {section}{\numberline {15.6}Decaying Windows}{567}
\contentsline {section}{\numberline {15.7}Filtros de Bloom}{568}
\contentsline {subsection}{\numberline {15.7.1}Extensi\IeC {\'o}n para admitir borrado}{570}
\contentsline {section}{\numberline {15.8}Cuckoo Filters}{570}
\contentsline {section}{\numberline {15.9}Count-Min}{573}
\contentsline {subsection}{\numberline {15.9.1}The Heavy Hitters Problem}{573}
\contentsline {subsubsection}{El problema de los HH aproximado}{574}
\contentsline {subsection}{\numberline {15.9.2}The Count-Min Sketch}{575}
\contentsline {subsection}{\numberline {15.9.3}Claves Transistorias: Bit-Marking}{576}
\contentsline {subsection}{\numberline {15.9.4}Teor\IeC {\'\i }a de Count-Min}{576}
\contentsline {subsubsection}{Conclusiones obtenidas}{580}
\contentsline {chapter}{\numberline {16}PageRank y sus Derivados}{581}
\contentsline {section}{\numberline {16.1}Random Surfers}{582}
\contentsline {section}{\numberline {16.2}Convergencia}{584}
\contentsline {section}{\numberline {16.3}Dead Ends}{586}
\contentsline {section}{\numberline {16.4}Spider Traps y Teletransportaci\IeC {\'o}n}{587}
\contentsline {subsubsection}{Interpretando $\beta $}{589}
\contentsline {section}{\numberline {16.5}Page Rank en la pr\IeC {\'a}ctica}{589}
\contentsline {subsection}{\numberline {16.5.1}PageRank via Map Reduce}{590}
\contentsline {section}{\numberline {16.6}TopicRank}{591}
\contentsline {section}{\numberline {16.7}TrustRank}{592}
\contentsline {section}{\numberline {16.8}SimRank}{593}
\contentsline {subsection}{\numberline {16.8.1}Sim Rank via Montecarlo}{594}
\contentsline {subsection}{\numberline {16.8.2}Panther}{595}
\contentsline {section}{\numberline {16.9}VisualRank}{595}
\contentsline {section}{\numberline {16.10}TextRank}{597}
\contentsline {section}{\numberline {16.11}HITS}{598}
\contentsline {chapter}{\numberline {17}Redes Sociales}{603}
\contentsline {section}{\numberline {17.1}Propiedades de los Grafos}{603}
\contentsline {subsection}{\numberline {17.1.1}Propiedades B\IeC {\'a}sicas}{603}
\contentsline {subsection}{\numberline {17.1.2}Almacenamiento del Grafo}{604}
\contentsline {subsection}{\numberline {17.1.3}Componentes Conexos}{604}
\contentsline {subsection}{\numberline {17.1.4}Di\IeC {\'a}metro}{605}
\contentsline {subsection}{\numberline {17.1.5}Coeficiente de Clustering}{605}
\contentsline {subsection}{\numberline {17.1.6}Distribuci\IeC {\'o}n del Grado}{606}
\contentsline {section}{\numberline {17.2}Propiedades del Grafo de una Red Social}{606}
\contentsline {subsection}{\numberline {17.2.1}Componente Conexo}{606}
\contentsline {subsection}{\numberline {17.2.2}Distribuci\IeC {\'o}n del Grado}{606}
\contentsline {subsection}{\numberline {17.2.3}Di\IeC {\'a}metro y Camino M\IeC {\'\i }nimo Promedio}{607}
\contentsline {subsection}{\numberline {17.2.4}Coeficiente de Clustering}{608}
\contentsline {section}{\numberline {17.3}Modelos de Grafos}{608}
\contentsline {subsection}{\numberline {17.3.1}Modelo de Erd\"{o}s-Renyi}{609}
\contentsline {subsubsection}{Di\IeC {\'a}metro}{609}
\contentsline {subsubsection}{Distribuci\IeC {\'o}n del Grado}{609}
\contentsline {subsubsection}{Coeficiente de Clustering}{610}
\contentsline {subsection}{\numberline {17.3.2}Modelo de Barabasi-Albert}{610}
\contentsline {subsection}{\numberline {17.3.3}Modelo de Jackson-Rodgers}{611}
\contentsline {section}{\numberline {17.4}El Mundo Peque\IeC {\~n}o}{611}
\contentsline {subsection}{\numberline {17.4.1}Modelo de Watts Strogratz}{612}
\contentsline {section}{\numberline {17.5}La Clausura Triangular}{613}
\contentsline {subsection}{\numberline {17.5.1}Contando Tri\IeC {\'a}ngulos en un Grafo}{613}
\contentsline {subsection}{\numberline {17.5.2}Tri\IeC {\'a}ngulos Locales}{616}
\contentsline {subsection}{\numberline {17.5.3}EE Eigenplots}{617}
\contentsline {section}{\numberline {17.6}Centralidad}{618}
\contentsline {subsection}{\numberline {17.6.1}Grado}{619}
\contentsline {subsection}{\numberline {17.6.2}Betweenness}{619}
\contentsline {subsection}{\numberline {17.6.3}Closeness}{621}
\contentsline {subsection}{\numberline {17.6.4}Eigenvector Centrality y PageRank}{622}
\contentsline {subsection}{\numberline {17.6.5}Centralidad de Bonacich}{623}
\contentsline {subsection}{\numberline {17.6.6}Centralidad de la Red (Freeman)}{623}
\contentsline {section}{\numberline {17.7}Detecci\IeC {\'o}n de Comunidades}{624}
\contentsline {subsection}{\numberline {17.7.1}Uniones Fuertes y D\IeC {\'e}biles}{624}
\contentsline {subsection}{\numberline {17.7.2}Modularidad}{625}
\contentsline {subsubsection}{Algoritmo de Louvain}{625}
\contentsline {subsection}{\numberline {17.7.3}Algoritmo de Girwan-Newman}{627}
\contentsline {subsection}{\numberline {17.7.4}Clustering Espectral}{628}
\contentsline {subsection}{\numberline {17.7.5}Infomap}{628}
\contentsline {subsection}{\numberline {17.7.6}Comunidades con Solapamiento: Big Clam}{632}
\contentsline {section}{\numberline {17.8}An\IeC {\'a}lisis de comunidades en Redes Sociales}{634}
\contentsline {section}{\numberline {17.9}Cascadas en Redes Sociales}{636}
\contentsline {subsection}{\numberline {17.9.1}Cascadas}{636}
\contentsline {subsection}{\numberline {17.9.2}Maximizaci\IeC {\'o}n de Influencia}{639}
\contentsline {section}{\numberline {17.10}Redes Sociales con Signo}{640}
\contentsline {section}{\numberline {17.11}Motifs}{643}
\contentsline {section}{\numberline {17.12}El Lenguaje en las Redes Sociales}{645}
\contentsline {section}{\numberline {17.13}Evoluci\IeC {\'o}n de una Red Social}{646}
\contentsline {subsection}{\numberline {17.13.1}Cantidad de Nodos y Aristas}{646}
\contentsline {subsection}{\numberline {17.13.2}Di\IeC {\'a}metro}{648}
\contentsline {section}{\numberline {17.14}Grafos de Kronecker}{649}
\contentsline {subsection}{\numberline {17.14.1}Propiedades}{650}
\contentsline {subsection}{\numberline {17.14.2}Grafos de Kronecker Estoc\IeC {\'a}sticos}{651}
\contentsline {subsection}{\numberline {17.14.3}Estimaci\IeC {\'o}n de la matriz generadora}{651}
\contentsline {subsection}{\numberline {17.14.4}Generando Redes Sociales con Grafos de Kronecker}{652}
\contentsline {chapter}{\numberline {18}An\IeC {\'a}lisis Topol\IeC {\'o}gico de Datos}{655}
\contentsline {section}{\numberline {18.1}Propiedades del An\IeC {\'a}lisis Topol\IeC {\'o}gico}{656}
\contentsline {section}{\numberline {18.2}Los Datos como un Grafo}{657}
\contentsline {subsection}{\numberline {18.2.1}Complejo Vietoris-Rips}{657}
\contentsline {subsection}{\numberline {18.2.2}Complejo de Cech}{659}
\contentsline {section}{\numberline {18.3}Homolog\IeC {\'\i }a Persistente}{660}
\contentsline {subsection}{\numberline {18.3.1}N\IeC {\'u}meros de Betti y Barcodes}{661}
\contentsline {subsection}{\numberline {18.3.2}BarCodes}{662}
\contentsline {subsection}{\numberline {18.3.3}Otros puntos de An\IeC {\'a}lisis}{662}
\contentsline {section}{\numberline {18.4}Algoritmo Mapper}{663}
\contentsline {subsection}{\numberline {18.4.1}Lentes Topol\IeC {\'o}gicos Comunes}{664}
\contentsline {subsection}{\numberline {18.4.2}Cubriendo los Datos}{665}
\contentsline {subsection}{\numberline {18.4.3}Fase de Clustering}{667}
\contentsline {subsection}{\numberline {18.4.4}Construcci\IeC {\'o}n del Grafo}{667}
\contentsline {subsection}{\numberline {18.4.5}Ejemplo}{668}
\contentsline {subsection}{\numberline {18.4.6}Ejemplo}{671}
\contentsline {section}{\numberline {18.5}Conclusiones}{674}
\contentsline {part}{II\hspace {1em}Temas Adicionales}{677}
\contentsline {chapter}{\numberline {19}Patrones Frecuentes}{679}
\contentsline {section}{\numberline {19.1}A-Priori}{680}
\contentsline {subsection}{\numberline {19.1.1}Generaci\IeC {\'o}n de Candidatos Intermedia}{682}
\contentsline {section}{\numberline {19.2}PCY}{683}
\contentsline {section}{\numberline {19.3}Algoritmos Aleatorizados}{683}
\contentsline {subsection}{\numberline {19.3.1}Muestreo Aleatorio}{684}
\contentsline {subsection}{\numberline {19.3.2}Algoritmo SON}{684}
\contentsline {subsubsection}{SON en Map-Reduce}{685}
\contentsline {subsection}{\numberline {19.3.3}Algoritmo de Toivonen}{688}
\contentsline {section}{\numberline {19.4}Reglas de Asociaci\IeC {\'o}n}{689}
\contentsline {subsection}{\numberline {19.4.1}Soporte y Confianza en Reglas de Asociaci\IeC {\'o}n}{690}
\contentsline {subsubsection}{Soporte Alto, Confianza Alta}{690}
\contentsline {subsubsection}{Soporte Alto, Baja Confianza}{690}
\contentsline {subsubsection}{Soporte Bajo, Confianza Baja}{690}
\contentsline {subsubsection}{Soporte Bajo, Confianza Alta}{690}
\contentsline {subsection}{\numberline {19.4.2}Limitaciones del Soporte y Confianza para encontrar asociaciones interesantes}{690}
\contentsline {subsection}{\numberline {19.4.3}Lift}{691}
\contentsline {subsection}{\numberline {19.4.4}M\IeC {\'e}tricas invariantes a los valores nulos}{692}
\contentsline {chapter}{\numberline {20}Metadatos}{693}
\contentsline {section}{\numberline {20.1}Dublin Core}{694}
\contentsline {section}{\numberline {20.2}RDF}{695}
\contentsline {subsection}{\numberline {20.2.1}El Modelo RDF}{695}
\contentsline {subsection}{\numberline {20.2.2}RDF mediante Grafos}{696}
\contentsline {subsection}{\numberline {20.2.3}Recursos An\IeC {\'o}nimos}{696}
\contentsline {section}{\numberline {20.3}Turtle}{697}
\contentsline {section}{\numberline {20.4}Ontolog\IeC {\'\i }as}{698}
\contentsline {subsection}{\numberline {20.4.1}RDF-Schema (RDFs)}{698}
\contentsline {subsubsection}{Clases}{698}
\contentsline {subsubsection}{Domain y Range}{699}
\contentsline {subsection}{\numberline {20.4.2}Otros elementos de RDFs}{700}
\contentsline {section}{\numberline {20.5}SPARQL}{700}