diff --git a/.gitignore b/.gitignore index cb70c082..17b8dad8 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,8 @@ index_files/ Introduction-to-`pyspark`.tex index.tex +tex_adjusted.tex +tex_adjusted.pdf docs/*.pdf *.aux *.log diff --git a/tex_adjusted.pdf b/tex_adjusted.pdf deleted file mode 100644 index e4be7b33..00000000 Binary files a/tex_adjusted.pdf and /dev/null differ diff --git a/tex_adjusted.tex b/tex_adjusted.tex deleted file mode 100644 index 307dc8f4..00000000 --- a/tex_adjusted.tex +++ /dev/null @@ -1,12759 +0,0 @@ -% Options for packages loaded elsewhere -\PassOptionsToPackage{unicode}{hyperref} -\PassOptionsToPackage{hyphens}{url} -\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor} -% -\documentclass[ - 11pt, - letterpaper, - DIV=11, - numbers=noendperiod]{scrreprt} - -\usepackage{amsmath,amssymb} -\usepackage{iftex} -\ifPDFTeX - \usepackage[T1]{fontenc} - \usepackage[utf8]{inputenc} - \usepackage{textcomp} % provide euro and other symbols -\else % if luatex or xetex - \usepackage{unicode-math} - \defaultfontfeatures{Scale=MatchLowercase} - \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1} -\fi -\usepackage{lmodern} -\ifPDFTeX\else - % xetex/luatex font selection -\fi -% Use upquote if available, for straight quotes in verbatim environments -\IfFileExists{upquote.sty}{\usepackage{upquote}}{} -\IfFileExists{microtype.sty}{% use microtype if available - \usepackage[]{microtype} - \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts -}{} -\makeatletter -\@ifundefined{KOMAClassName}{% if non-KOMA class - \IfFileExists{parskip.sty}{% - \usepackage{parskip} - }{% else - \setlength{\parindent}{0pt} - \setlength{\parskip}{6pt plus 2pt minus 1pt}} -}{% if KOMA class - \KOMAoptions{parskip=half}} -\makeatother -\usepackage{xcolor} -\setlength{\emergencystretch}{3em} % prevent overfull lines -\setcounter{secnumdepth}{5} -% Make \paragraph and \subparagraph free-standing -\ifx\paragraph\undefined\else - \let\oldparagraph\paragraph - \renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}} -\fi -\ifx\subparagraph\undefined\else - \let\oldsubparagraph\subparagraph - \renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}} -\fi - -\usepackage{color} -\usepackage{fancyvrb} -\newcommand{\VerbBar}{|} -\newcommand{\VERB}{\Verb[commandchars=\\\{\}]} -\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}} -% Add ',fontsize=\small' for more characters per line -\usepackage{framed} -\definecolor{shadecolor}{RGB}{241,243,245} -\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}} -\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}} -\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}} -\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.40,0.45,0.13}{#1}} -\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}} -\newcommand{\BuiltInTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}} -\newcommand{\CharTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}} -\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}} -\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}} -\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}} -\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}} -\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}} -\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}} -\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}} -\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}} 
-\newcommand{\ExtensionTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}} -\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}} -\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.28,0.35,0.67}{#1}} -\newcommand{\ImportTok}[1]{\textcolor[rgb]{0.00,0.46,0.62}{#1}} -\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}} -\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}} -\newcommand{\NormalTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}} -\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}} -\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}} -\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}} -\newcommand{\RegionMarkerTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}} -\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}} -\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}} -\newcommand{\StringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}} -\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.07,0.07,0.07}{#1}} -\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}} -\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}} - -\providecommand{\tightlist}{% - \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}\usepackage{longtable,booktabs,array} -\usepackage{calc} % for calculating minipage widths -% Correct order of tables after \paragraph or \subparagraph -\usepackage{etoolbox} -\makeatletter -\patchcmd\longtable{\par}{\if@noskipsec\mbox{}\fi\par}{}{} -\makeatother -% Allow footnotes in longtable head/foot -\IfFileExists{footnotehyper.sty}{\usepackage{footnotehyper}}{\usepackage{footnote}} -\makesavenoteenv{longtable} -\usepackage{graphicx} -\makeatletter -\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} -\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi} -\makeatother -% Scale images if necessary, so that they will not overflow the page -% margins by default, and it is still possible to overwrite the defaults -% using explicit options in \includegraphics[width, height, ...]{} -\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} -% Set default figure placement to htbp -\makeatletter -\def\fps@figure{htbp} -\makeatother -% definitions for citeproc citations -\NewDocumentCommand\citeproctext{}{} -\NewDocumentCommand\citeproc{mm}{% - \begingroup\def\citeproctext{#2}\cite{#1}\endgroup} -\makeatletter - % allow citations to break across lines - \let\@cite@ofmt\@firstofone - % avoid brackets around text for \cite: - \def\@biblabel#1{} - \def\@cite#1#2{{#1\if@tempswa , #2\fi}} -\makeatother -\newlength{\cslhangindent} -\setlength{\cslhangindent}{1.5em} -\newlength{\csllabelwidth} -\setlength{\csllabelwidth}{3em} -\newenvironment{CSLReferences}[2] % #1 hanging-indent, #2 entry-spacing - {\begin{list}{}{% - \setlength{\itemindent}{0pt} - \setlength{\leftmargin}{0pt} - \setlength{\parsep}{0pt} - % turn on hanging indent if param 1 is 1 - \ifodd #1 - \setlength{\leftmargin}{\cslhangindent} - \setlength{\itemindent}{-1\cslhangindent} - \fi - % set entry spacing - \setlength{\itemsep}{#2\baselineskip}}} - {\end{list}} -\usepackage{calc} -\newcommand{\CSLBlock}[1]{\hfill\break\parbox[t]{\linewidth}{\strut\ignorespaces#1\strut}} -\newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{\strut#1\strut}} -\newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{\strut#1\strut}} -\newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1} - -\usepackage[english]{babel} -\usepackage{fontspec} 
-\setmainfont{Times New Roman} -\setmonofont{Inconsolata-Regular.ttf}[ - Path = Fonts/Inconsolata/static/, - BoldFont = Inconsolata-Bold.ttf -] - - -\usepackage{geometry} -\geometry{ - paperwidth=7.125in, paperheight=10.250in, - top=1in, bottom=1in -} -\KOMAoption{captions}{tableheading} -\makeatletter -\@ifpackageloaded{bookmark}{}{\usepackage{bookmark}} -\makeatother -\makeatletter -\@ifpackageloaded{caption}{}{\usepackage{caption}} -\AtBeginDocument{% -\ifdefined\contentsname - \renewcommand*\contentsname{Table of contents} -\else - \newcommand\contentsname{Table of contents} -\fi -\ifdefined\listfigurename - \renewcommand*\listfigurename{List of Figures} -\else - \newcommand\listfigurename{List of Figures} -\fi -\ifdefined\listtablename - \renewcommand*\listtablename{List of Tables} -\else - \newcommand\listtablename{List of Tables} -\fi -\ifdefined\figurename - \renewcommand*\figurename{Figure} -\else - \newcommand\figurename{Figure} -\fi -\ifdefined\tablename - \renewcommand*\tablename{Table} -\else - \newcommand\tablename{Table} -\fi -} -\@ifpackageloaded{float}{}{\usepackage{float}} -\floatstyle{ruled} -\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]} -\floatname{codelisting}{Listing} -\newcommand*\listoflistings{\listof{codelisting}{List of Listings}} -\makeatother -\makeatletter -\makeatother -\makeatletter -\@ifpackageloaded{caption}{}{\usepackage{caption}} -\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}} -\makeatother -\ifLuaTeX - \usepackage{selnolig} % disable illegal ligatures -\fi -\usepackage{bookmark} - -\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available -\urlstyle{same} % disable monospaced font for URLs -\hypersetup{ - pdftitle={Introduction to pyspark}, - pdfauthor={Pedro Duarte Faria}, - colorlinks=true, - linkcolor={blue}, - filecolor={Maroon}, - citecolor={Blue}, - urlcolor={Blue}, - pdfcreator={LaTeX via pandoc}} - -\title{Introduction to \texttt{pyspark}} -\author{Pedro Duarte Faria} -\date{2024-01-09} - -\begin{document} -\maketitle - -\renewcommand*\contentsname{Table of contents} -{ -\hypersetup{linkcolor=} -\setcounter{tocdepth}{2} -\tableofcontents -} -\bookmarksetup{startatroot} - -\chapter*{Preface}\label{preface} -\addcontentsline{toc}{chapter}{Preface} - -\markboth{Preface}{Preface} - -\section*{About this book}\label{about-this-book} -\addcontentsline{toc}{section}{About this book} - -\markright{About this book} - -Hello! This book provides an introduction to -\href{https://spark.apache.org/docs/latest/api/python/}{\texttt{pyspark}}\footnote{\url{https://spark.apache.org/docs/latest/api/python/}}, -which is a Python API to \href{https://spark.apache.org/}{Apache -Spark}\footnote{\url{https://spark.apache.org/}}. Here, you will learn -how to perform the most commom data analysis tasks and useful data -transformations with Python to process huge amounts of data. - -In essence, \texttt{pyspark} is a python package that provides an API -for Apache Spark. In other words, with \texttt{pyspark} you are able to -use the python language to write Spark applications and run them on a -Spark cluster in a scalable and elegant way. This book focus on teaching -the fundamentals of \texttt{pyspark}, and how to use it for big data -analysis. - -This book, also contains a small introduction to key python concepts -that are important to understand how \texttt{pyspark} is organized. 
-Since we will be using Apache Spark under the hood, it is also very -important to understand a little bit of how Apache Spark works, so, we -provide a small introduction to Apache Spark as well. - -Big part of the knowledge exposed here is extracted from a lot of -practical experience of the author, working with \texttt{pyspark} to -analyze big data at platforms such as Databricks\footnote{\url{https://databricks.com/}}. -Another part of the knowledge is extracted from the official -documentation of Apache Spark (\emph{Apache Spark Official -Documentation} 2022), as well as some established works such as Chambers -and Zaharia (2018) and Damji et al. (2020). - -Some of the main subjects discussed in the book are: - -\begin{itemize} -\tightlist -\item - How an Apache Spark application works? -\item - What are Spark DataFrames? -\item - How to transform and model your Spark DataFrame. -\item - How to import data into Apache Spark. -\item - How to work with SQL inside \texttt{pyspark}. -\item - Tools for manipulating specific data types (e.g.~string, dates and - datetimes). -\item - How to use window functions. -\end{itemize} - -\section*{About the author}\label{about-the-author} -\addcontentsline{toc}{section}{About the author} - -\markright{About the author} - -Pedro Duarte Faria have a bachelor degree in Economics from Federal -University of Ouro Preto - Brazil. Currently, he is a Data Engineer at -\href{https://www.blip.ai/en/}{Blip}\footnote{\url{https://www.blip.ai/en/}}, -and an Associate Developer for Apache Spark 3.0 certified by Databricks. - -The author have more than 3 years of experience in the data analysis -market. He developed data pipelines, reports and analysis for research -institutions and some of the largest companies in the brazilian -financial sector, such as the BMG Bank, Sodexo and Pan Bank, besides -dealing with databases that go beyond the billion rows. - -Furthermore, Pedro is specialized on the R programming language, and -have given several lectures and courses about it, inside graduate -centers (such as PPEA-UFOP\footnote{\url{https://ppea.ufop.br/}}), in -addition to federal and state organizations (such as FJP-MG\footnote{\url{http://fjp.mg.gov.br/}}). -As researcher, he have experience in the field of Science, Technology -and Innovation Economics. - -Personal Website: \url{https://pedro-faria.netlify.app/} - -Twitter: \href{https://twitter.com/PedroPark9}{@PedroPark9} - -Mastodon: -\href{https://fosstodon.org/@pedropark99}{@pedropark99@fosstodon.org} - -\section*{Some conventions of this -book}\label{some-conventions-of-this-book} -\addcontentsline{toc}{section}{Some conventions of this book} - -\markright{Some conventions of this book} - -\subsection*{Python code and terminal -commands}\label{python-code-and-terminal-commands} -\addcontentsline{toc}{subsection}{Python code and terminal commands} - -This book is about \texttt{pyspark}, which is a python package. As a -result, we will be exposing a lot of python code across the entire book. -Examples of python code, are always shown inside a gray rectangle, like -this example below. - -Every visible result that this python code produce, will be written in -plain black outside of the gray rectangle, just below the command that -produced that visible result. So in the example below, the value -\texttt{729} is the only visible result of this python code, and, the -statement \texttt{print(y)} is the command that triggered this visible -result. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{x }\OperatorTok{=} \DecValTok{3} -\NormalTok{y }\OperatorTok{=} \DecValTok{9} \OperatorTok{**}\NormalTok{ x} - -\BuiltInTok{print}\NormalTok{(y)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -729 -\end{verbatim} - -Furthermore, all terminal commands that we expose in this book, will -always be: pre-fixed by \texttt{Terminal\$}; written in black; and, not -outlined by a gray rectangle. In the example below, the command -\texttt{pip\ install\ jupyter} should be inserted in the terminal of the -OS (whatever is the terminal that your OS uses), and not in the python -interpreter, because this command is prefixed with \texttt{Terminal\$}. - -\begin{verbatim} -Terminal$ pip install jupyter -\end{verbatim} - -Some terminal commands may produce visible results as well. In that -case, these results will be right below the respective command, and will -not be pre-fixed with \texttt{Terminal\$}. For example, we can see below -that the command \texttt{echo\ "Hello!"} produces the result -\texttt{"Hello!"}. - -\begin{verbatim} -Terminal$ echo "Hello!" -\end{verbatim} - -\begin{verbatim} -Hello! -\end{verbatim} - -\subsection*{Python objects, functions and -methods}\label{python-objects-functions-and-methods} -\addcontentsline{toc}{subsection}{Python objects, functions and methods} - -When I refer to some python object, function, method or package, I will -use a monospaced font. In other words, if I have a python object called -``name'', and, I am describing this object, I will use \texttt{name} in -the paragraph, and not ``name''. The same logic applies to Python -functions, methods and package names. - -\section*{Be aware of differences between -OS's!}\label{be-aware-of-differences-between-oss} -\addcontentsline{toc}{section}{Be aware of differences between OS's!} - -\markright{Be aware of differences between OS's!} - -Spark is available for all three main operational systems (or OS's) used -in the world (Windows, MacOs and Linux). I will use constantly the word -OS as an abbreviation to ``operational system''. - -The snippets of python code shown throughout this book should just run -correctly no matter which one of the three OS's you are using. In other -words, the python code snippets are made to be portable. So you can just -copy and paste them to your computer, no matter which OS you are using. - -But, at some points, I may need to show you some terminal commands that -are OS specific, and are not easily portable. For example, Linux have a -package manager, but Windows does not have one. This means that, if you -are on Linux, you will need to use some terminal commands to install -some necessary programs (like python). In contrast, if you are on -Windows, you will generally download executable files (\texttt{.exe}) -that make this installation for you. - -In cases like this, I will always point out the specific OS of each one -of the commands, or, I will describe the necessary steps to be made on -each one the OS's. Just be aware that these differences exists between -the OS's. - -\section*{Install the necessary -software}\label{install-the-necessary-software} -\addcontentsline{toc}{section}{Install the necessary software} - -\markright{Install the necessary software} - -If you want to follow the examples shown throughout this book, you must -have Apache Spark and \texttt{pyspark} installed on your machine. 
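As a side note, and only as one possible route: if you just want to run the examples of this book locally (i.e.~in single-node mode), the \texttt{pyspark} package itself can usually be installed straight from PyPI with \texttt{pip}, and this installation already ships a local Spark distribution that is enough to follow along:

\begin{verbatim}
Terminal$ pip install pyspark
\end{verbatim}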
If you -do not know how to do this, you can consult the -\href{https://phoenixnap.com/kb/install-spark-on-ubuntu}{articles from -phoenixNAP which are very useful}\footnote{\url{https://phoenixnap.com/kb/install-spark-on-ubuntu}.}. - -\section*{Book's metadata}\label{books-metadata} -\addcontentsline{toc}{section}{Book's metadata} - -\markright{Book's metadata} - -\subsection*{License}\label{license} -\addcontentsline{toc}{subsection}{License} - -Copyright © 2024 Pedro Duarte Faria. This book is licensed by the -\href{https://creativecommons.org/licenses/by/4.0/}{CC-BY 4.0 Creative -Commons Attribution 4.0 International Public License}\footnote{\url{https://creativecommons.org/licenses/by/4.0/}}. - -\includegraphics[width=0.91667in,height=\textheight]{Figures/creative-commoms-88x31.png} - -\subsection*{Book citation}\label{book-citation} -\addcontentsline{toc}{subsection}{Book citation} - -You can use the following BibTex entry to cite this book: - -\begin{verbatim} -@book{pedro2024, - author = {Pedro Duarte Faria}, - title = {Introduction to pyspark}, - month = {January}, - year = {2024}, - address = {Belo Horizonte} -} -\end{verbatim} - -\subsection*{Corresponding author and -maintainer}\label{corresponding-author-and-maintainer} -\addcontentsline{toc}{subsection}{Corresponding author and maintainer} - -Pedro Duarte Faria - -Contact: -\href{mailto:pedropark99@gmail.com}{\nolinkurl{pedropark99@gmail.com}} - -Personal website: \url{https://pedro-faria.netlify.app/} - -\bookmarksetup{startatroot} - -\chapter{Key concepts of python}\label{key-concepts-of-python} - -If you have experience with python, and understands how objects and -classes works, you might want to skip this entire chapter. But, if you -are new to the language and do not have much experience with it, you -might want to stick a little bit, and learn a few key concepts that will -help you to understand how the \texttt{pyspark} package is organized, -and how to work with it. - -\section{Scripts}\label{scripts} - -Python programs are written in plain text files that are saved with the -\texttt{.py} extension. After you save these files, they are usually -called ``scripts''. So a script is just a text file that contains all -the commands that make your python program. - -There are many IDEs or programs that help you to write, manage, run and -organize this kind of files (like Microsoft Visual Studio -Code\footnote{\url{https://code.visualstudio.com/}}, PyCharm\footnote{\url{https://www.jetbrains.com/pycharm/}}, -Anaconda\footnote{\url{https://www.anaconda.com/products/distribution}} -and RStudio\footnote{\url{https://www.rstudio.com/}}). Many of these -programs are free to use, and, are easy to install. - -But, if you do not have any of them installed, you can just create a new -plain text file from the built-in Notepad program of your OS -(operational system), and, save it with the \texttt{.py} extension. - -\section{How to run a python program}\label{how-to-run-a-python-program} - -As you learn to write your Spark applications with \texttt{pyspark}, at -some point, you will want to actually execute this \texttt{pyspark} -program, to see its result. To do so, you need to execute it as a python -program. There are many ways to run a python program, but I will show -you the more ``standard'' way. That is to use the \texttt{python} -command inside the terminal of your OS (you need to have python already -installed). - -As an example, lets create a simple ``Hello world'' program. 
First, open -a new text file then save it somewhere in your machine (with the name -\texttt{hello.py}). Remember to save the file with the \texttt{.py} -extension. Then copy and paste the following command into this file: - -\begin{Shaded} -\begin{Highlighting}[] -\BuiltInTok{print}\NormalTok{(}\StringTok{"Hello World!"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -It will be much easier to run this script, if you open your OS's -terminal inside the folder where you save the \texttt{hello.py} file. -After you opened the terminal inside the folder, just run the -\texttt{python3\ hello.py} command. As a result, python will execute -\texttt{hello.py}, and, the text \texttt{Hello\ World!} should be -printed to the terminal: - -\begin{verbatim} -Terminal$ python3 hello.py -\end{verbatim} - -\begin{verbatim} -Hello World! -\end{verbatim} - -But, if for some reason you could not open the terminal inside the -folder, just open a terminal (in any way you can), then, use the -\texttt{cd} command (stands for ``change directory'') with the path to -the folder where you saved \texttt{hello.py}. This way, your terminal -will be rooted in this folder. - -For example, if I saved \texttt{hello.py} inside my Documents folder, -the path to this folder in Windows would be something like this: -\texttt{"C:\textbackslash{}Users\textbackslash{}pedro\textbackslash{}Documents"}. -On the other hand, this path on Linux would be something like -\texttt{"/usr/pedro/Documents"}. So the command to change to this -directory would be: - -\begin{verbatim} -# On Windows: -Terminal$ cd "C:\Users\pedro\Documents" -# On Linux: -Terminal$ cd "/usr/pedro/Documents" -\end{verbatim} - -After this \texttt{cd} command, you can run the -\texttt{python\ hello.py} command in the terminal, and get the exact -same result of the previous example. - -There you have it! So every time you need to run your python program (or -your \texttt{pyspark} program), just open a terminal and run the command -\texttt{python\ \textless{}complete\ path\ to\ your\ script\textgreater{}}. -If the terminal is rooted on the folder where you saved your script, you -can just use the -\texttt{python\ \textless{}name\ of\ the\ script\textgreater{}} command. - -\section{Objects}\label{objects} - -Although python is a general-purpose language, most of its features are -focused on object-oriented programming. Meaning that, python is a -programming language focused on creating, managing and modifying objects -and classes of objects. - -So, when you work with python, you are basically applying many -operations and functions over a set of objects. In essence, an object in -python, is a name that refers to a set of data. This data can be -anything that you computer can store (or represent). - -Having that in mind, an object is just a name, and this name is a -reference, or a key to access some data. To define an object in python, -you must use the assignment operator, which is the equal sign -(\texttt{=}). In the example below, we are defining, or, creating an -object called \texttt{x}, and it stores the value 10. Therefore, with -the name \texttt{x} we can access this value of 10. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{x }\OperatorTok{=} \DecValTok{10} -\BuiltInTok{print}\NormalTok{(x)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -10 -\end{verbatim} - -When we store a value inside an object, we can easily reuse this value -in multiple operations or expressions: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Multiply by 2} -\BuiltInTok{print}\NormalTok{(x }\OperatorTok{*} \DecValTok{2}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -20 -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Divide by 3} -\BuiltInTok{print}\NormalTok{(x }\OperatorTok{/} \DecValTok{3}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -3.3333333333333335 -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Print its class} -\BuiltInTok{print}\NormalTok{(}\BuiltInTok{type}\NormalTok{(x))} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} - -\end{verbatim} - -Remember, an object can store any type of value, or any type of data. -For example, it can store a single string, like the object -\texttt{salutation} below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{salutation }\OperatorTok{=} \StringTok{"Hello! My name is Pedro"} -\end{Highlighting} -\end{Shaded} - -Or, a list of multiple strings: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{names }\OperatorTok{=}\NormalTok{ [} - \StringTok{"Anne"}\NormalTok{, }\StringTok{"Vanse"}\NormalTok{, }\StringTok{"Elliot"}\NormalTok{,} - \StringTok{"Carlyle"}\NormalTok{, }\StringTok{"Ed"}\NormalTok{, }\StringTok{"Memphis"} -\NormalTok{]} - -\BuiltInTok{print}\NormalTok{(names)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -['Anne', 'Vanse', 'Elliot', 'Carlyle', 'Ed', 'Memphis'] -\end{verbatim} - -Or a dict containing the description of a product: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{product }\OperatorTok{=}\NormalTok{ \{} - \StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}Coca Cola\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}volume\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}2 litters\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}price\textquotesingle{}}\NormalTok{: }\FloatTok{2.52}\NormalTok{,} - \StringTok{\textquotesingle{}group\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}non{-}alcoholic drinks\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}department\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}drinks\textquotesingle{}} -\NormalTok{\}} - -\BuiltInTok{print}\NormalTok{(product)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -{'name': 'Coca Cola', 'volume': '2 litters', 'price': 2.52, 'grou -p': 'non-alcoholic drinks', 'department': 'drinks'} -\end{verbatim} - -And many other things\ldots{} - -\section{Expressions}\label{expressions} - -Python programs are organized in blocks of expressions (or statements). -A python expression is a statement that describes an operation to be -performed by the program. For example, the expression below describes -the sum between 3 and 5. - -\begin{Shaded} -\begin{Highlighting}[] -\DecValTok{3} \OperatorTok{+} \DecValTok{5} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -8 -\end{verbatim} - -The expression above is composed of numbers (like 3 and 5) and a -operator, more specifically, the sum operator (\texttt{+}). But any -python expression can include a multitude of different items. 
It can be -composed of functions (like \texttt{print()}, \texttt{map()} and -\texttt{str()}), constant strings (like \texttt{"Hello\ World!"}), -logical operators (like \texttt{!=}, \texttt{\textless{}}, -\texttt{\textgreater{}} and \texttt{==}), arithmetic operators (like -\texttt{*}, \texttt{/}, \texttt{**}, \texttt{\%}, \texttt{-} and -\texttt{+}), structures (like lists, arrays and dicts) and many other -types of commands. - -Below we have a more complex example, that contains the \texttt{def} -keyword (which starts a function definition; in the example below, this -new function being defined is \texttt{double()}), many built-in -functions (\texttt{list()}, \texttt{map()} and \texttt{print()}), a -arithmetic operator (\texttt{*}), numbers and a list (initiated by the -pair of brackets - \texttt{{[}{]}}). - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{def}\NormalTok{ double(x):} - \ControlFlowTok{return}\NormalTok{ x }\OperatorTok{*} \DecValTok{2} - -\BuiltInTok{print}\NormalTok{(}\BuiltInTok{list}\NormalTok{(}\BuiltInTok{map}\NormalTok{(double, [}\DecValTok{4}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{1}\NormalTok{])))} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -[8, 4, 12, 2] -\end{verbatim} - -Python expressions are evaluated in a sequential manner (from top to -bottom of your python file). In other words, python runs the first -expression in the top of your file, them, goes to the second expression, -and runs it, them goes to the third expression, and runs it, and goes on -and on in that way, until it hits the end of the file. So, in the -example above, python executes the function definition (initiated at -\texttt{def\ double(x):}), before it executes the \texttt{print()} -statement, because the print statement is below the function definition. - -This order of evaluation is commonly referred as ``control flow'' in -many programming languages. Sometimes, this order can be a fundamental -part of the python program. Meaning that, sometimes, if we change the -order of the expressions in the program, we can produce unexpected -results (like an error), or change the results produced by the program. - -As an example, the program below prints the result 4, because the print -statement is executed before the expression \texttt{x\ =\ 40}. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{x }\OperatorTok{=} \DecValTok{1} - -\BuiltInTok{print}\NormalTok{(x }\OperatorTok{*} \DecValTok{4}\NormalTok{)} - -\NormalTok{x }\OperatorTok{=} \DecValTok{40} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -4 -\end{verbatim} - -But, if we execute the expression \texttt{x\ =\ 40} before the print -statement, we then change the result produced by the program. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{x }\OperatorTok{=} \DecValTok{1} -\NormalTok{x }\OperatorTok{=} \DecValTok{40} - -\BuiltInTok{print}\NormalTok{(x }\OperatorTok{*} \DecValTok{4}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -160 -\end{verbatim} - -If we go a little further, and, put the print statement as the first -expression of the program, we then get a name error. This error warns us -that, the object named \texttt{x} is not defined (i.e.~it does not -exist). 
- -\begin{Shaded} -\begin{Highlighting}[] -\BuiltInTok{print}\NormalTok{(x }\OperatorTok{*} \DecValTok{4}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -Traceback (most recent call last): - File "", line 1, in -NameError: name 'x' is not defined -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{x }\OperatorTok{=} \DecValTok{1} -\NormalTok{x }\OperatorTok{=} \DecValTok{40} -\end{Highlighting} -\end{Shaded} - -This error occurs, because inside the print statement, we call the name -\texttt{x}. But, this is the first expression of the program, and at -this point of the program, we did not defined a object called -\texttt{x}. We make this definition, after the print statement, with -\texttt{x\ =\ 1} and \texttt{x\ =\ 40}. In other words, at this point, -python do not know any object called \texttt{x}. - -\section{Packages (or libraries)}\label{packages-or-libraries} - -A python package (or a python ``library'') is basically a set of -functions and classes that provides important functionality to solve a -specific problem. And \texttt{pyspark} is one of these many python -packages available. - -Python packages are usually published (that is, made available to the -public) through the PyPI archive\footnote{\url{https://pypi.org/}}. If a -python package is published in PyPI, then, you can easily install it -through the \texttt{pip} tool. - -To use a python package, you always need to: 1) have this package -installed on your machine; 2) import this package in your python script. -If a package is not installed in your machine, you will face a -\texttt{ModuleNotFoundError} as you try to use it, like in the example -below. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{import}\NormalTok{ pandas} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -Traceback (most recent call last): - File "", line 1, in -ModuleNotFoundError: No module named 'pandas' -\end{verbatim} - -If your program produce this error, is very likely that you are trying -to use a package that is not currently installed on your machine. To -install it, you may use the -\texttt{pip\ install\ \textless{}name\ of\ the\ package\textgreater{}} -command on the terminal of your OS. - -\begin{verbatim} -Terminal$ pip install pandas -\end{verbatim} - -But, if this package is already installed in your machine, then, you can -just import it to your script. To do this, you just include an -\texttt{import} statement at the start of your python file. 
For example, -if I want to use the \texttt{DataFrame} function from the -\texttt{pandas} package: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Now that I installed the \textasciigrave{}pandas\textasciigrave{} package with \textasciigrave{}pip\textasciigrave{}} -\CommentTok{\# this \textasciigrave{}import\textasciigrave{} statement works without any errors:} -\ImportTok{import}\NormalTok{ pandas} - -\NormalTok{df }\OperatorTok{=}\NormalTok{ pandas.DataFrame([} -\NormalTok{ (}\DecValTok{1}\NormalTok{, }\DecValTok{3214}\NormalTok{), (}\DecValTok{2}\NormalTok{, }\DecValTok{4510}\NormalTok{), } -\NormalTok{ (}\DecValTok{1}\NormalTok{, }\DecValTok{9082}\NormalTok{), (}\DecValTok{4}\NormalTok{, }\DecValTok{7822}\NormalTok{)} -\NormalTok{])} - -\BuiltInTok{print}\NormalTok{(df)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} - 0 1 -0 1 3214 -1 2 4510 -2 1 9082 -3 4 7822 -\end{verbatim} - -Therefore, with \texttt{import\ pandas} I can access any of the -functions available in the \texttt{pandas} package, by using the dot -operator after the name of the package -(\texttt{pandas.\textless{}name\ of\ the\ function\textgreater{}}). -However, it can become very annoying to write \texttt{pandas.} every -time you want to access a function from \texttt{pandas}, specially if -you use it constantly in your code. - -To make life a little easier, python offers some alternative ways to -define this \texttt{import} statement. First, you can give an alias to -this package that is shorter/easier to write. As an example, nowadays, -is virtually a industry standard to import the \texttt{pandas} package -as \texttt{pd}. To do this, you use the \texttt{as} keyword in your -\texttt{import} statement. This way, you can access the \texttt{pandas} -functionality with -\texttt{pd.\textless{}name\ of\ the\ function\textgreater{}}: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd} - -\NormalTok{df }\OperatorTok{=}\NormalTok{ pd.DataFrame([} -\NormalTok{ (}\DecValTok{1}\NormalTok{, }\DecValTok{3214}\NormalTok{), (}\DecValTok{2}\NormalTok{, }\DecValTok{4510}\NormalTok{), } -\NormalTok{ (}\DecValTok{1}\NormalTok{, }\DecValTok{9082}\NormalTok{), (}\DecValTok{4}\NormalTok{, }\DecValTok{7822}\NormalTok{)} -\NormalTok{])} - -\BuiltInTok{print}\NormalTok{(df)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} - 0 1 -0 1 3214 -1 2 4510 -2 1 9082 -3 4 7822 -\end{verbatim} - -In contrast, if you want to make your life even easier and produce a -more ``clean'' code, you can import (from the package) just the -functions that you need to use. In this method, you can eliminate the -dot operator, and refer directly to the function by its name. To use -this method, you include the \texttt{from} keyword in your import -statement, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pandas }\ImportTok{import}\NormalTok{ DataFrame} - -\NormalTok{df }\OperatorTok{=}\NormalTok{ DataFrame([} -\NormalTok{ (}\DecValTok{1}\NormalTok{, }\DecValTok{3214}\NormalTok{), (}\DecValTok{2}\NormalTok{, }\DecValTok{4510}\NormalTok{), } -\NormalTok{ (}\DecValTok{1}\NormalTok{, }\DecValTok{9082}\NormalTok{), (}\DecValTok{4}\NormalTok{, }\DecValTok{7822}\NormalTok{)} -\NormalTok{])} - -\BuiltInTok{print}\NormalTok{(df)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} - 0 1 -0 1 3214 -1 2 4510 -2 1 9082 -3 4 7822 -\end{verbatim} - -Just to be clear, you can import multiple functions from the package, by -listing them. 
Or, if you prefer, you can import all components of the -package (or module/sub-module) by using the star shortcut (\texttt{*}): - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Import \textasciigrave{}search()\textasciigrave{}, \textasciigrave{}match()\textasciigrave{} and \textasciigrave{}compile()\textasciigrave{} functions:} -\ImportTok{from}\NormalTok{ re }\ImportTok{import}\NormalTok{ search, match, }\BuiltInTok{compile} -\CommentTok{\# Import all functions from the \textasciigrave{}os\textasciigrave{} package} -\ImportTok{from}\NormalTok{ os }\ImportTok{import} \OperatorTok{*} -\end{Highlighting} -\end{Shaded} - -Some packages may be very big, and includes many different functions and -classes. As the size of the package becomes bigger and bigger, -developers tend to divide this package in many ``modules''. In other -words, the functions and classes of this python package are usually -organized in ``modules''. - -As an example, the \texttt{pyspark} package is a fairly large package, -that contains many classes and functions. Because of it, the package is -organized in a number of modules, such as \texttt{sql} (to access Spark -SQL), \texttt{pandas} (to access the Pandas API of Spark), \texttt{ml} -(to access Spark MLib). - -To access the functions available in each one of these modules, you use -the dot operator between the name of the package and the name of the -module. For example, to import all components from the \texttt{sql} and -\texttt{pandas} modules of \texttt{pyspark}, you would do this: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import} \OperatorTok{*} -\ImportTok{from}\NormalTok{ pyspark.pandas }\ImportTok{import} \OperatorTok{*} -\end{Highlighting} -\end{Shaded} - -Going further, we can have sub-modules (or modules inside a module) too. -As an example, the \texttt{sql} module of \texttt{pyspark} have the -\texttt{functions} and \texttt{window} sub-modules. To access these -sub-modules, you use the dot operator again: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Importing \textasciigrave{}functions\textasciigrave{} and \textasciigrave{}window\textasciigrave{} sub{-}modules:} -\ImportTok{import}\NormalTok{ pyspark.sql.functions }\ImportTok{as}\NormalTok{ F} -\ImportTok{import}\NormalTok{ pyspark.sql.window }\ImportTok{as}\NormalTok{ W} -\end{Highlighting} -\end{Shaded} - -\section{Methods versus Functions}\label{methods-versus-functions} - -Beginners tend mix these two types of functions in python, but they are -not the same. So lets describe the differences between the two. - -Standard python functions, are \textbf{functions that we apply over an -object}. A classical example, is the \texttt{print()} function. You can -see in the example below, that we are applying \texttt{print()} over the -\texttt{result} object. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{result }\OperatorTok{=} \DecValTok{10} \OperatorTok{+} \DecValTok{54} -\BuiltInTok{print}\NormalTok{(result)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -64 -\end{verbatim} - -Other examples of a standard python function would be \texttt{map()} and -\texttt{list()}. 
See in the example below, that we apply the -\texttt{map()} function over a set of objects: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{words }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}apple\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}star\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}abc\textquotesingle{}}\NormalTok{]} -\NormalTok{lengths }\OperatorTok{=} \BuiltInTok{map}\NormalTok{(}\BuiltInTok{len}\NormalTok{, words)} -\BuiltInTok{list}\NormalTok{(lengths)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -[5, 4, 3] -\end{verbatim} - -In contrast, a python method is a function registered inside a python -class. In other words, this function \textbf{belongs to the class -itself}, and cannot be used outside of it. This means that, in order to -use a method, you need to have an instance of the class where it is -registered. - -For example, the \texttt{startswith()} method belongs to the -\texttt{str} class (this class is used to represent strings in python). -So to use this method, we need to have an instance of this class saved -in a object that we can access. Note in the example below, that we -access the \texttt{startswith()} method through the \texttt{name} -object. This means that, \texttt{startswith()} is a function. But, we -cannot use it without an object of class \texttt{str}, like -\texttt{name}. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{name }\OperatorTok{=} \StringTok{"Pedro"} -\NormalTok{name.startswith(}\StringTok{"P"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -True -\end{verbatim} - -Note in the example above, that we access any class method in the same -way that we would access a sub-module/module of a package. That is, by -using the dot operator (\texttt{.}). - -So, if we have a class called \texttt{people}, and, this class has a -method called \texttt{location()}, we can use this \texttt{location()} -method by using the dot operator (\texttt{.}) with the name of an object -of class \texttt{people}. If an object called \texttt{x} is an instance -of \texttt{people} class, then, we can do \texttt{x.location()}. - -But if this object \texttt{x} is of a different class, like -\texttt{int}, then we can no longer use the \texttt{location()} method, -because this method does not belong to the \texttt{int} class. For -example, if your object is from class \texttt{A}, and, you try to use a -method of class \texttt{B}, you will get an \texttt{AttributeError}. - -In the example exposed below, I have an object called \texttt{number} of -class \texttt{int}, and, I try to use the method \texttt{startswith()} -from \texttt{str} class with this object: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{number }\OperatorTok{=} \DecValTok{2} -\CommentTok{\# You can see below that, the \textasciigrave{}x\textasciigrave{} object have class \textasciigrave{}int\textasciigrave{}} -\BuiltInTok{type}\NormalTok{(number)} -\CommentTok{\# Trying to use a method from \textasciigrave{}str\textasciigrave{} class} -\NormalTok{number.startswith(}\StringTok{"P"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -AttributeError: 'int' object has no attribute 'startswith' -\end{verbatim} - -\section{Identifying classes and their -methods}\label{identifying-classes-and-their-methods} - -Over the next chapters, you will realize that \texttt{pyspark} programs -tend to use more methods than standard functions. So most of the -functionality of \texttt{pyspark} resides in class methods. 
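To make this point a bit more concrete, below is a minimal sketch (assuming you already have a working Spark installation and a Spark Session, which are described in later chapters) of how typical \texttt{pyspark} functionality is accessed through methods of the \texttt{DataFrame} class, using the dot operator, rather than through standalone functions:

\begin{verbatim}
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# `table` is an instance of the DataFrame class:
table = spark.range(5)

# `count()` and `show()` are methods registered inside
# the DataFrame class, so we access them through the
# `table` object with the dot operator:
print(table.count())
table.show()
\end{verbatim}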
As a result, -the capability of understanding the objects that you have in your python -program, and, identifying its classes and methods will be crucial while -you are developing and debugging your Spark applications. - -Every existing object in python represents an instance of a class. In -other words, every object in python is associated to a given class. You -can always identify the class of an object, by applying the -\texttt{type()} function over this object. In the example below, we can -see that, the \texttt{name} object is an instance of the \texttt{str} -class. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{name }\OperatorTok{=} \StringTok{"Pedro"} -\BuiltInTok{type}\NormalTok{(name)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -str -\end{verbatim} - -If you do not know all the methods that a class have, you can always -apply the \texttt{dir()} function over this class to get a list of all -available methods. For example, lets suppose you wanted to see all -methods from the \texttt{str} class. To do so, you would do this: - -\begin{Shaded} -\begin{Highlighting}[] -\BuiltInTok{dir}\NormalTok{(}\BuiltInTok{str}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -['__add__', '__class__', '__contains__', '__delattr__', - '__dir__', '__doc__', '__eq__', '__format__', '__ge__', - '__getattribute__', '__getitem__', '__getnewargs__', - '__gt__', '__hash__', '__init__', '__init_subclass__', - '__iter__', '__le__', '__len__', '__lt__', '__mod__', - '__mul__', '__ne__', '__new__', '__reduce__', - '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', - '__setattr__', '__sizeof__', '__str__', '__subclasshook__', - 'capitalize', 'casefold', 'center', 'count', 'encode', - 'endswith', 'expandtabs', 'find', 'format', 'format_map', - 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', - 'isdigit', 'isidentifier', 'islower', 'isnumeric', - 'isprintable', 'isspace', 'istitle', 'isupper', 'join', - 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', - 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', - 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', - 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill'] -\end{verbatim} - -\bookmarksetup{startatroot} - -\chapter{Introducing Apache Spark}\label{sec-introd-spark} - -In essence, \texttt{pyspark} is an API to Apache Spark (or simply -Spark). In other words, with \texttt{pyspark} we can build Spark -applications using the python language. So, by learning a little more -about Spark, you will understand a lot more about \texttt{pyspark}. - -\section{What is Spark?}\label{what-is-spark} - -Spark is a multi-language engine for large-scale data processing that -supports both single-node machines and clusters of machines -(\emph{Apache Spark Official Documentation} 2022). Nowadays, Spark -became the de facto standard for structure and manage big data -applications. - -It has a number of features that its predecessors did not have, like the -capacity for in-memory processing and stream processing (Karau et al. -2015). But, the most important feature of all, is that Spark is an -\textbf{unified platform} for big data processing (Chambers and Zaharia -2018). - -This means that Spark comes with multiple built-in libraries and tools -that deals with different aspects of the work with big data. 
It has a -built-in SQL engine\footnote{\url{https://spark.apache.org/sql/}} for -performing large-scale data processing; a complete library for scalable -machine learning (\texttt{MLib}\footnote{\url{https://spark.apache.org/docs/latest/ml-guide.html}}); -a stream processing engine\footnote{\url{https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html\#overview}} -for streaming analytics; and much more; - -In general, big companies have many different data necessities, and as a -result, the engineers and analysts may have to combine and integrate -many tools and techniques together, so they can build many different -data pipelines to fulfill these necessities. But this approach can -create a very serious dependency problem, which imposes a great barrier -to support this workflow. This is one of the big reasons why Spark got -so successful. It eliminates big part of this problem, by already -including almost everything that you might need to use. - -\begin{quote} -Spark is designed to cover a wide range of workloads that previously -required separate distributed systems \ldots{} By supporting these -workloads in the same engine, Spark makes it easy and inexpensive to -combine different processing types, which is often necessary in -production data analysis pipelines. In addition, it reduces the -management burden of maintaining separate tools (Karau et al. 2015). -\end{quote} - -\section{Spark application}\label{spark-application} - -Your personal computer can do a lot of things, but, it cannot -efficiently deal with huge amounts of data. For this situation, we need -several machines working together, adding up their resources to deal -with the volume or complexity of the data. Spark is the framework that -coordinates the computations across this set of machines (Chambers and -Zaharia 2018). Because of this, a relevant part of Spark's structure is -deeply connected to distributed computing models. - -You probably do not have a cluster of machines at home. So, while -following the examples in this book, you will be running Spark on a -single machine (i.e.~single node mode). But lets just forget about this -detail for a moment. - -In every Spark application, you always have a single machine behaving as -the driver node, and multiple machines behaving as the worker nodes. The -driver node is responsible for managing the Spark application, -i.e.~asking for resources, distributing tasks to the workers, collecting -and compiling the results, \ldots. The worker nodes are responsible for -executing the tasks that are assigned to them, and they need to send the -results of these tasks back to the driver node. - -Every Spark application is distributed into two different and -independent processes: 1) a driver process; 2) and a set of executor -processes (Chambers and Zaharia 2018). The driver process, or, the -driver program, is where your application starts, and it is executed by -the driver node. This driver program is responsible for: 1) maintaining -information about your Spark Application; 2) responding to a user's -program or input; 3) and analyzing, distributing, and scheduling work -across the executors (Chambers and Zaharia 2018). - -Every time a Spark application starts, the driver process has to -communicate with the cluster manager, to acquire workers to perform the -necessary tasks. In other words, the cluster manager decides if Spark -can use some of the resources (i.e.~some of the machines) of the -cluster. 
If the cluster manager allow Spark to use the nodes it needs, -the driver program will break the application into many small tasks, and -will assign these tasks to the worker nodes. - -The executor processes, are the processes that take place within each -one of the worker nodes. Each executor process is composed of a set of -tasks, and the worker node is responsible for performing and executing -these tasks that were assigned to him, by the driver program. After -executing these tasks, the worker node will send the results back to the -driver node (or the driver program). If they need, the worker nodes can -communicate with each other, while performing its tasks. - -This structure is represented in Figure~\ref{fig-spark-application}: - -\begin{figure} - -\centering{ - -\includegraphics{Chapters/../Figures/spark-application.png} - -} - -\caption{\label{fig-spark-application}Spark application structure on a -cluster of computers} - -\end{figure}% - -When you run Spark on a cluster of computers, you write the code of your -Spark application (i.e.~your \texttt{pyspark} code) on your (single) -local computer, and then, submit this code to the driver node. After -that, the driver node takes care of the rest, by starting your -application, creating your Spark Session, asking for new worker nodes, -sending the tasks to be performed, collecting and compiling the results -and giving back these results to you. - -However, when you run Spark on your (single) local computer, the process -is very similar. But, instead of submitting your code to another -computer (which is the driver node), you will submit to your own local -computer. In other words, when Spark is running on single-node mode, -your computer becomes the driver and the worker node at the same time. - -\section{\texorpdfstring{Spark application versus \texttt{pyspark} -application}{Spark application versus pyspark application}}\label{spark-application-versus-pyspark-application} - -The \texttt{pyspark} package is just a tool to write Spark applications -using the python programming language. This means, that every -\texttt{pyspark} application is a Spark application written in python. - -With this conception in mind, you can understand that a \texttt{pyspark} -application is a description of a Spark application. When we compile (or -execute) our python program, this description is translated into a raw -Spark application that will be executed by Spark. - -To write a \texttt{pyspark} application, you write a python script that -uses the \texttt{pyspark} library. When you execute this python script -with the python interpreter, the application will be automatically -converted to Spark code, and will be sent to Spark to be executed across -the cluster; - -\section{\texorpdfstring{Core parts of a \texttt{pyspark} -program}{Core parts of a pyspark program}}\label{core-parts-of-a-pyspark-program} - -In this section, I want to point out the core parts that composes every -\texttt{pyspark} program. 
This means that every \texttt{pyspark} program -that you write will have these ``core parts'', which are: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi})} -\item - importing the \texttt{pyspark} package (or modules); -\item - starting your Spark Session; -\item - defining a set of transformations and actions over Spark DataFrames; -\end{enumerate} - -\subsection{\texorpdfstring{Importing the \texttt{pyspark} package (or -modules)}{Importing the pyspark package (or modules)}}\label{importing-the-pyspark-package-or-modules} - -Spark comes with a lot of functionality installed. But, in order to use -it in your \texttt{pyspark} program, you have to import most of these -functionalities to your session. This means that you have to import -specific packages (or ``modules'') of \texttt{pyspark} to your python -session. - -For example, most of the functions used to define our transformations -and aggregations in Spark DataFrames, comes from the -\texttt{pyspark.sql.functions} module. - -That is why we usually start our python scripts by importing functions -from this module, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import} \BuiltInTok{sum}\NormalTok{, col} -\NormalTok{sum\_expr }\OperatorTok{=} \BuiltInTok{sum}\NormalTok{(col(}\StringTok{\textquotesingle{}Value\textquotesingle{}}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -Or, importing the entire module with the \texttt{import} keyword, like -this: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{import}\NormalTok{ pyspark.sql.functions }\ImportTok{as}\NormalTok{ F} -\NormalTok{sum\_expr }\OperatorTok{=}\NormalTok{ F.}\BuiltInTok{sum}\NormalTok{(F.col(}\StringTok{\textquotesingle{}Value\textquotesingle{}}\NormalTok{))} -\end{Highlighting} -\end{Shaded} - -\subsection{Starting your Spark -Session}\label{starting-your-spark-session} - -Every Spark application starts with a Spark Session. Basically, the -Spark Session is the entry point to your application. This means that, -in every \texttt{pyspark} program that you write, \textbf{you should -always start by defining your Spark Session}. We do this, by using the -\texttt{getOrCreate()} method from -\texttt{pyspark.sql.SparkSession.builder} module. - -Just store the result of this method in any python object. Is very -common to name this object as \texttt{spark}, like in the example below. -This way, you can access all the information and methods of Spark from -this \texttt{spark} object. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ SparkSession} -\NormalTok{spark }\OperatorTok{=}\NormalTok{ SparkSession.builder.getOrCreate()} -\end{Highlighting} -\end{Shaded} - -\subsection{Defining a set of transformations and -actions}\label{defining-a-set-of-transformations-and-actions} - -Every \texttt{pyspark} program is composed by a set of transformations -and actions over a set of Spark DataFrames. - -I will explain Spark DataFrames in more deth on the -Chapter~\ref{sec-dataframes-chapter}. For now just understand that they -are the basic data sctructure that feed all \texttt{pyspark} programs. -In other words, on every \texttt{pyspark} program we are transforming -multiple Spark DataFrames to get the result we want. - -As an example, in the script below we begin with the Spark DataFrame -stored in the object \texttt{students}, and, apply multiple -transformations over it to build the \texttt{ar\_department} DataFrame. 
-Lastly, we apply the \texttt{.show()} action over the -\texttt{ar\_department} DataFrame: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} -\CommentTok{\# Apply some transformations over} -\CommentTok{\# the \textasciigrave{}students\textasciigrave{} DataFrame:} -\NormalTok{ar\_department }\OperatorTok{=}\NormalTok{ students}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}Age\textquotesingle{}}\NormalTok{) }\OperatorTok{\textgreater{}} \DecValTok{22}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}IsArDepartment\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}Department\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \StringTok{\textquotesingle{}AR\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(col(}\StringTok{\textquotesingle{}Age\textquotesingle{}}\NormalTok{).desc())} - - -\CommentTok{\# Apply the \textasciigrave{}.show()\textasciigrave{} action} -\CommentTok{\# over the \textasciigrave{}ar\_department\textasciigrave{} DataFrame:} -\NormalTok{ar\_department.show()} -\end{Highlighting} -\end{Shaded} - -\section{Building your first Spark -application}\label{building-your-first-spark-application} - -To demonstrate what a \texttt{pyspark} program looks like, lets write -and run our first example of a Spark application. This Spark application -will build a simple table of 1 column that contains 5 numbers, and then, -it will return a simple python list containing this five numbers as a -result. - -\subsection{Writing the code}\label{writing-the-code} - -First, create a new blank text file in your computer, and save it -somewhere with the name \texttt{spark-example.py}. Do not forget to put -the \texttt{.py} extension in the name. This program we are writing -together is a python program, and should be treated as such. With the -\texttt{.py} extension in the name file, you are stating this fact quite -clearly to your computer. - -After you created and saved the python script (i.e.~the text file with -the \texttt{.py} extension), you can start writing your \texttt{pyspark} -program. As we noted in the previous section, you should always start -your \texttt{pyspark} program by defining your Spark Session, with this -code: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ SparkSession} -\NormalTok{spark }\OperatorTok{=}\NormalTok{ SparkSession.builder.getOrCreate()} -\end{Highlighting} -\end{Shaded} - -After you defined your Spark Session, and saved it in an object called -\texttt{spark}, you can now access all Spark's functionality through -this \texttt{spark} object. - -To create our first Spark table we use the \texttt{range()} method from -the \texttt{spark} object. The \texttt{range()} method works similarly -as the standard python function called \texttt{range()}. It basically -creates a sequence of numbers, from 0 to \(n - 1\). However, this -\texttt{range()} method from \texttt{spark} stores this sequence of -numbers as rows in a Spark table (or a Spark DataFrame): - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{table }\OperatorTok{=}\NormalTok{ spark.}\BuiltInTok{range}\NormalTok{(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -After this step, we want to collect all the rows of the resulting table -into a python list. 
And to do that, we use the \texttt{collect()} method
-from the Spark table:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{result }\OperatorTok{=}\NormalTok{ table.collect()}
-\BuiltInTok{print}\NormalTok{(result)}
-\end{Highlighting}
-\end{Shaded}
-
-So, the entire program is composed of these three parts (or sections) of
-code. If you need it, the entire program is reproduced below. You can
-copy and paste all of this code to your python script, and then save it:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# The entire program:}
-\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ SparkSession}
-\NormalTok{spark }\OperatorTok{=}\NormalTok{ SparkSession.builder.getOrCreate()}
-
-\NormalTok{table }\OperatorTok{=}\NormalTok{ spark.}\BuiltInTok{range}\NormalTok{(}\DecValTok{5}\NormalTok{)}
-\NormalTok{result }\OperatorTok{=}\NormalTok{ table.collect()}
-\BuiltInTok{print}\NormalTok{(result)}
-\end{Highlighting}
-\end{Shaded}
-
-\subsection{Executing the code}\label{executing-the-code}
-
-Now that you have written your first Spark application with
-\texttt{pyspark}, you want to execute this application and see its
-results. Yet, to run a \texttt{pyspark} program, remember that you need
-to have the necessary software installed on your machine. In case you do
-not have Apache Spark installed yet, I personally recommend reading the
-\href{https://phoenixnap.com/kb/install-spark-on-ubuntu}{articles
-from PhoenixNAP on how to install Apache Spark}\footnote{\url{https://phoenixnap.com/kb/install-spark-on-ubuntu}.}.
-
-Anyway, to execute this \texttt{pyspark} script that you wrote, you need
-to send it to the python interpreter. To do this, you need to: 1)
-open a terminal inside the folder where your python script is stored;
-and, 2) use the python command from the terminal with the name of your
-python script.
-
-In my current situation, I am running Spark on an Ubuntu distribution,
-and I saved the \texttt{spark-example.py} script inside a folder called
-\texttt{SparkExample}. This means that I need to open a terminal that
-is rooted inside this \texttt{SparkExample} folder.
-
-You probably have saved your \texttt{spark-example.py} file in a
-different folder of your computer. This means that you need to open the
-terminal from a different folder.
-
-After opening a terminal rooted inside the \texttt{SparkExample}
-folder, I just use the \texttt{python3} command to access the python
-interpreter, and give it the name of the python script that I want to
-execute, in this case, the \texttt{spark-example.py} file. As a result,
-our first \texttt{pyspark} program is executed:
-
-\begin{verbatim}
-Terminal$ python3 spark-example.py
-\end{verbatim}
-
-\begin{verbatim}
-[Row(id=0), Row(id=1), Row(id=2), Row(id=3), Row(id=4)]
-\end{verbatim}
-
-You can see in the result above that this Spark application produces a
-sequence of \texttt{Row} objects, inside a Python list. Each \texttt{Row}
-object contains a number from 0 to 4.
-
-Congratulations! You have just run your first Spark application using
-\texttt{pyspark}!
-
-\section{\texorpdfstring{Overview of
-\texttt{pyspark}}{Overview of pyspark}}\label{overview-of-pyspark}
-
-Before we continue, I want to give you a very brief overview of the main
-parts of \texttt{pyspark} that are the most useful and most important to
-know about.
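-
-As a quick sanity check before exploring these parts, you can confirm
-which version of \texttt{pyspark} is installed, since the exact contents
-of some modules vary slightly between releases (this is just a minimal
-sketch; the version string you get depends on your own installation):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# Print the version of the installed pyspark package:}
-\ImportTok{import}\NormalTok{ pyspark}
-\BuiltInTok{print}\NormalTok{(pyspark.\_\_version\_\_)}
-\end{Highlighting}
-\end{Shaded}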
-
-\subsection{Main python modules}\label{main-python-modules}
-
-The main python modules that exist in \texttt{pyspark} are:
-
-\begin{itemize}
-\tightlist
-\item
-  \texttt{pyspark.sql.SparkSession}: the \texttt{SparkSession} class,
-  which defines your Spark Session, i.e.~the entry point to your Spark
-  application;
-\item
-  \texttt{pyspark.sql.dataframe}: module that defines the
-  \texttt{DataFrame} class;
-\item
-  \texttt{pyspark.sql.column}: module that defines the \texttt{Column}
-  class;
-\item
-  \texttt{pyspark.sql.types}: module that contains all data types of
-  Spark;
-\item
-  \texttt{pyspark.sql.functions}: module that contains all of the main
-  Spark functions that we use in transformations;
-\item
-  \texttt{pyspark.sql.window}: module that defines the \texttt{Window}
-  class, which is responsible for defining windows in a Spark DataFrame;
-\end{itemize}
-
-\subsection{Main python classes}\label{main-python-classes}
-
-The main python classes that exist in \texttt{pyspark} are:
-
-\begin{itemize}
-\item
-  \texttt{DataFrame}: represents a Spark DataFrame, and it is the main
-  data structure in \texttt{pyspark}. In essence, it represents a
-  distributed collection of data organized into named columns;
-\item
-  \texttt{Column}: represents a column in a Spark DataFrame;
-\item
-  \texttt{GroupedData}: represents a grouped Spark DataFrame (result of
-  \texttt{DataFrame.groupby()});
-\item
-  \texttt{Window}: describes a window in a Spark DataFrame;
-\item
-  \texttt{DataFrameReader} and \texttt{DataFrameWriter}: classes
-  responsible for reading data from a data source into a Spark
-  DataFrame, and writing data from a Spark DataFrame into a data source;
-\item
-  \texttt{DataFrameNaFunctions}: class that stores all main methods for
-  dealing with null values (i.e.~missing data);
-\end{itemize}
-
-\bookmarksetup{startatroot}
-
-\chapter{Introducing Spark DataFrames}\label{sec-dataframes-chapter}
-
-In this chapter, you will understand how Spark represents and manages
-tables (or tabular data). Different programming languages and frameworks
-use different names to describe a table. But, in Apache Spark, they are
-referred to as Spark DataFrames.
-
-In \texttt{pyspark}, these DataFrames are stored inside python objects
-of class \texttt{pyspark.sql.dataframe.DataFrame}, and all the methods
-present in this class are commonly referred to as the DataFrame API of
-Spark. This is the most important API of Spark, because most of your
-Spark applications will heavily use this API to compose your data
-transformations and data flows (Chambers and Zaharia 2018).
-
-\section{Spark DataFrames versus Spark
-Datasets}\label{spark-dataframes-versus-spark-datasets}
-
-Spark has two notions of structured data: DataFrames and Datasets. In
-summary, a Spark Dataset is a distributed collection of data
-(\emph{Apache Spark Official Documentation} 2022). In contrast, a Spark
-DataFrame is a Spark Dataset organized into named columns (\emph{Apache
-Spark Official Documentation} 2022).
-
-This means that Spark DataFrames are very similar to the tables we know
-from relational databases (RDBMS), or from spreadsheets (like Excel). So
-in a Spark DataFrame, each column has a name, and they all have the same
-number of rows. Furthermore, all the values inside a column must be of
-the same type of data, but each column can store a different type of
-data.
-
-On the other hand, Spark Datasets are considered a collection of any
-type of data.
So a Dataset might be a collection of unstructured data as
-well, like log files, JSON and XML trees, etc. Spark Datasets can be
-created and transformed through the Dataset API of Spark. But this API
-is available only in the Scala and Java APIs of Spark. For this reason,
-we do not act directly on Datasets with \texttt{pyspark}, only on
-DataFrames. That is OK, because for most applications we do want to use
-DataFrames, and not Datasets, to represent our data.
-
-However, what makes a Spark DataFrame different from other data frames,
-like the \texttt{pandas} DataFrame, or the native R \texttt{data.frame}
-structure? It is the \textbf{distributed} aspect of it. Spark DataFrames
-are based on Spark Datasets, and these Datasets are collections of data
-that are distributed across the cluster. As an example, let's suppose
-you have the following table stored as a Spark DataFrame:
-
-\begin{longtable}[]{@{}lll@{}}
-\toprule\noalign{}
-ID & Name & Value \\
-\midrule\noalign{}
-\endhead
-\bottomrule\noalign{}
-\endlastfoot
-1 & Anne & 502 \\
-2 & Carls & 432 \\
-3 & Stoll & 444 \\
-4 & Percy & 963 \\
-5 & Martha & 123 \\
-6 & Sigrid & 621 \\
-\end{longtable}
-
-If you are running Spark on a 4-node cluster (one driver node and three
-worker nodes), each worker node of the cluster will store a section of
-this data. So you, as the programmer, will see, manage and transform
-this table as if it were a single, unified table. But under the hood,
-Spark will split this data and store it as many fragments across the
-Spark cluster. Figure~\ref{fig-distributed-df} presents this notion in a
-visual manner.
-
-\begin{figure}
-
-\centering{
-
-\includegraphics{Chapters/../Figures/distributed-df.png}
-
-}
-
-\caption{\label{fig-distributed-df}A Spark DataFrame is distributed
-across the cluster}
-
-\end{figure}%
-
-\section{Partitions of a Spark
-DataFrame}\label{sec-dataframe-partitions}
-
-A Spark DataFrame is always broken into many small pieces, and these
-pieces are always spread across the cluster of machines. Each one of
-these small pieces of the total data is considered a DataFrame
-\emph{partition}.
-
-For the most part, you do not manipulate these partitions manually or
-individually (Karau et al. 2015), because Spark automatically does this
-job for you.
-
-As we exposed in Figure~\ref{fig-distributed-df}, each node of the
-cluster will hold a piece of the total DataFrame. If we translate this
-distribution into a ``partition'' distribution, this means that each
-node of the cluster can hold one or multiple partitions of the Spark
-DataFrame.
-
-If we sum all partitions present in a node of the cluster, we get a
-chunk of the total DataFrame. The figure below demonstrates this notion:
-
-\begin{figure}
-
-\centering{
-
-\includegraphics{Chapters/../Figures/partitions-df.png}
-
-}
-
-\caption{\label{fig-partitions-df}Partitions of a DataFrame}
-
-\end{figure}%
-
-If the Spark DataFrame is not big, each node of the cluster will
-probably store just a single partition of this DataFrame. In contrast,
-depending on the complexity and size of the DataFrame, Spark will split
-this DataFrame into more partitions than there are nodes in the cluster.
-In this case, each node of the cluster will hold more than one partition
-of the total DataFrame.
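-
-If you want to check this behavior yourself, a minimal sketch (assuming
-you already have a Spark Session stored in a \texttt{spark} object, as
-in the previous chapter) is to look at how many partitions Spark chose
-for a DataFrame through its underlying RDD, and then ask Spark to
-redistribute the data with the \texttt{repartition()} method. The exact
-number printed by the first \texttt{print()} depends on your machine and
-configuration:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# Check how many partitions Spark chose for this DataFrame:}
-\NormalTok{df\_example }\OperatorTok{=}\NormalTok{ spark.}\BuiltInTok{range}\NormalTok{(}\DecValTok{1000}\NormalTok{)}
-\BuiltInTok{print}\NormalTok{(df\_example.rdd.getNumPartitions())}
-
-\CommentTok{\# Ask Spark to redistribute the same data across 8 partitions:}
-\NormalTok{df\_repartitioned }\OperatorTok{=}\NormalTok{ df\_example.repartition(}\DecValTok{8}\NormalTok{)}
-\BuiltInTok{print}\NormalTok{(df\_repartitioned.rdd.getNumPartitions())}
-\end{Highlighting}
-\end{Shaded}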
- -\section{\texorpdfstring{The \texttt{DataFrame} class in -\texttt{pyspark}}{The DataFrame class in pyspark}}\label{sec-dataframe-class} - -In \texttt{pyspark}, every Spark DataFrame is stored inside a python -object of class \texttt{pyspark.sql.dataframe.DataFrame}. Or more -succintly, a object of class \texttt{DataFrame}. - -Like any python class, the \texttt{DataFrame} class comes with multiple -methods that are available for every object of this class. This means -that you can use any of these methods in any Spark DataFrame that you -create through \texttt{pyspark}. - -As an example, in the code below I expose all the available methods from -this \texttt{DataFrame} class. First, I create a Spark DataFrame with -\texttt{spark.range(5)}, and, store it in the object \texttt{df5}. After -that, I use the \texttt{dir()} function to show all the methods that I -can use through this \texttt{df5} object: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df5 }\OperatorTok{=}\NormalTok{ spark.}\BuiltInTok{range}\NormalTok{(}\DecValTok{5}\NormalTok{)} -\NormalTok{available\_methods }\OperatorTok{=} \BuiltInTok{dir}\NormalTok{(df5)} -\BuiltInTok{print}\NormalTok{(available\_methods)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -['__class__', '__delattr__', '__dict__', '__dir__', - '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', - '__getattribute__', '__getitem__', '__gt__', '__hash__', - '__init__', '__init_subclass__', '__le__', '__lt__', - '__module__', '__ne__', '__new__', '__reduce__', - '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', - '__str__', '__subclasshook__', '__weakref__', - '_collect_as_arrow', '_ipython_key_completions_', '_jcols', - '_jdf', '_jmap', '_joinAsOf', '_jseq', '_lazy_rdd', - '_repr_html_', '_sc', '_schema', '_session', '_sort_cols', - '_sql_ctx', '_support_repr_html', 'agg', 'alias', - 'approxQuantile', 'cache', 'checkpoint', 'coalesce', - 'colRegex', 'collect', 'columns', 'corr', 'count', 'cov', - 'createGlobalTempView', 'createOrReplaceGlobalTempView', - 'createOrReplaceTempView', 'createTempView', 'crossJoin', - 'crosstab', 'cube', 'describe', 'distinct', 'drop', - 'dropDuplicates', 'dropDuplicatesWithinWatermark', - 'drop_duplicates', 'dropna', 'dtypes', 'exceptAll', - 'explain', 'fillna', 'filter', 'first', 'foreach', - 'foreachPartition', 'freqItems', 'groupBy', 'groupby', - 'head', 'hint', 'id', 'inputFiles', 'intersect', - 'intersectAll', 'isEmpty', 'isLocal', 'isStreaming', - 'is_cached', 'join', 'limit', 'localCheckpoint', - 'mapInArrow', 'mapInPandas', 'melt', 'na', 'observe', - 'offset', 'orderBy', 'pandas_api', 'persist', 'printSchema', - 'randomSplit', 'rdd', 'registerTempTable', 'repartition', - 'repartitionByRange', 'replace', 'rollup', 'sameSemantics', - 'sample', 'sampleBy', 'schema', 'select', 'selectExpr', - 'semanticHash', 'show', 'sort', 'sortWithinPartitions', - 'sparkSession', 'sql_ctx', 'stat', 'storageLevel', - 'subtract', 'summary', 'tail', 'take', 'to', 'toDF', - 'toJSON', 'toLocalIterator', 'toPandas', 'to_koalas', - 'to_pandas_on_spark', 'transform', 'union', 'unionAll', - 'unionByName', 'unpersist', 'unpivot', 'where', 'withColumn', - 'withColumnRenamed', 'withColumns', 'withColumnsRenamed', - 'withMetadata', 'withWatermark', 'write', 'writeStream', - 'writeTo'] -\end{verbatim} - -All the methods present in this \texttt{DataFrame} class, are commonly -referred as the \emph{DataFrame API of Spark}. Remember, this is the -most important API of Spark. 
This is because most of your Spark applications will heavily use this
-API to compose your data transformations and data flows (Chambers and
-Zaharia 2018).
-
-\section{Building a Spark DataFrame}\label{sec-building-a-dataframe}
-
-There are a few different methods to create a Spark DataFrame. For
-example, because a DataFrame is basically a Dataset of rows, we can
-build a DataFrame from a collection of \texttt{Row}'s, through the
-\texttt{createDataFrame()} method from your Spark Session:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ SparkSession}
-\NormalTok{spark }\OperatorTok{=}\NormalTok{ SparkSession.builder.getOrCreate()}
-\ImportTok{from}\NormalTok{ datetime }\ImportTok{import}\NormalTok{ date}
-\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ Row}
-
-\NormalTok{data }\OperatorTok{=}\NormalTok{ [}
-\NormalTok{    Row(}\BuiltInTok{id} \OperatorTok{=} \DecValTok{1}\NormalTok{, value }\OperatorTok{=} \FloatTok{28.3}\NormalTok{, date }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2021}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{1}\NormalTok{)),}
-\NormalTok{    Row(}\BuiltInTok{id} \OperatorTok{=} \DecValTok{2}\NormalTok{, value }\OperatorTok{=} \FloatTok{15.8}\NormalTok{, date }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2021}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{1}\NormalTok{)),}
-\NormalTok{    Row(}\BuiltInTok{id} \OperatorTok{=} \DecValTok{3}\NormalTok{, value }\OperatorTok{=} \FloatTok{20.1}\NormalTok{, date }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2021}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{2}\NormalTok{)),}
-\NormalTok{    Row(}\BuiltInTok{id} \OperatorTok{=} \DecValTok{4}\NormalTok{, value }\OperatorTok{=} \FloatTok{12.6}\NormalTok{, date }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2021}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{3}\NormalTok{))}
-\NormalTok{]}
-
-\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data)}
-\end{Highlighting}
-\end{Shaded}
-
-Remember that a Spark DataFrame in python is an object of class
-\texttt{pyspark.sql.dataframe.DataFrame}, as you can see below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\BuiltInTok{type}\NormalTok{(df)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-pyspark.sql.dataframe.DataFrame
-\end{verbatim}
-
-If you try to see what is inside this kind of object, you will get a
-small description of the columns present in the DataFrame as a result:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-DataFrame[id: bigint, value: double, date: date]
-\end{verbatim}
-
-So, in the above example, we use the \texttt{Row()} constructor (from
-the \texttt{pyspark.sql} module) to build 4 rows. The
-\texttt{createDataFrame()} method stacks these 4 rows together to form
-our new DataFrame \texttt{df}. The result is a Spark DataFrame with 4
-rows and 3 columns (\texttt{id}, \texttt{value} and \texttt{date}).
-
-But you can use different methods to create the same Spark DataFrame. As
-another example, with the code below we are creating a DataFrame called
-\texttt{students} from two different python lists (\texttt{data} and
-\texttt{columns}).
-
-The first list (\texttt{data}) is a list of rows. Each row is
-represented by a python tuple, which contains the values in each column.
-The second list (\texttt{columns}) contains the names of each column in
-the DataFrame.
- -To create the \texttt{students} DataFrame we deliver these two lists to -\texttt{createDataFrame()} method: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\DecValTok{12114}\NormalTok{, }\StringTok{\textquotesingle{}Anne\textquotesingle{}}\NormalTok{, }\DecValTok{21}\NormalTok{, }\FloatTok{1.56}\NormalTok{, }\DecValTok{8}\NormalTok{, }\DecValTok{9}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{9}\NormalTok{, }\StringTok{\textquotesingle{}Economics\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}SC\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{13007}\NormalTok{, }\StringTok{\textquotesingle{}Adrian\textquotesingle{}}\NormalTok{, }\DecValTok{23}\NormalTok{, }\FloatTok{1.82}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{8}\NormalTok{, }\DecValTok{7}\NormalTok{, }\StringTok{\textquotesingle{}Economics\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}SC\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{10045}\NormalTok{, }\StringTok{\textquotesingle{}George\textquotesingle{}}\NormalTok{, }\DecValTok{29}\NormalTok{, }\FloatTok{1.77}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{9}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{7}\NormalTok{, }\StringTok{\textquotesingle{}Law\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}SC\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{12459}\NormalTok{, }\StringTok{\textquotesingle{}Adeline\textquotesingle{}}\NormalTok{, }\DecValTok{26}\NormalTok{, }\FloatTok{1.61}\NormalTok{, }\DecValTok{8}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{7}\NormalTok{, }\DecValTok{7}\NormalTok{, }\StringTok{\textquotesingle{}Law\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}SC\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{10190}\NormalTok{, }\StringTok{\textquotesingle{}Mayla\textquotesingle{}}\NormalTok{, }\DecValTok{22}\NormalTok{, }\FloatTok{1.67}\NormalTok{, }\DecValTok{7}\NormalTok{, }\DecValTok{7}\NormalTok{, }\DecValTok{7}\NormalTok{, }\DecValTok{9}\NormalTok{, }\StringTok{\textquotesingle{}Design\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}AR\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{11552}\NormalTok{, }\StringTok{\textquotesingle{}Daniel\textquotesingle{}}\NormalTok{, }\DecValTok{24}\NormalTok{, }\FloatTok{1.75}\NormalTok{, }\DecValTok{9}\NormalTok{, }\DecValTok{9}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{9}\NormalTok{, }\StringTok{\textquotesingle{}Design\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}AR\textquotesingle{}}\NormalTok{)} -\NormalTok{]} - -\NormalTok{columns }\OperatorTok{=}\NormalTok{ [} - \StringTok{\textquotesingle{}StudentID\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Age\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Height\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Score1\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}Score2\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Score3\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Score4\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Course\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Department\textquotesingle{}} -\NormalTok{]} - -\NormalTok{students }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, columns)} -\NormalTok{students} 
-\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -DataFrame[StudentID: bigint, Name: string, Age: bigint, Height: d -ouble, Score1: bigint, Score2: bigint, Score3: bigint, Score4: bi -gint, Course: string, Department: string] -\end{verbatim} - -You can also use a method that returns a \texttt{DataFrame} object by -default. Examples are the \texttt{table()} and \texttt{range()} methods -from your Spark Session, like we used in the -Section~\ref{sec-dataframe-class}, to create the \texttt{df5} object. - -Other examples are the methods used to read data and import it to -\texttt{pyspark}. These methods are available in the \texttt{spark.read} -module, like \texttt{spark.read.csv()} and \texttt{spark.read.json()}. -These methods will be described in more depth in -Chapter~\ref{sec-import}. - -\section{Visualizing a Spark DataFrame}\label{sec-viewing-a-dataframe} - -A key aspect of Spark is its laziness. In other words, for most -operations, Spark will only check if your code is correct and if it -makes sense. Spark will not actually run or execute the operations you -are describing in your code, unless you explicit ask for it with a -trigger operation, which is called an ``action'' (this kind of operation -is described in Section~\ref{sec-dataframe-actions}). - -You can notice this laziness in the output below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{students} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -DataFrame[StudentID: bigint, Name: string, Age: bigint, Height: d -ouble, Score1: bigint, Score2: bigint, Score3: bigint, Score4: bi -gint, Course: string, Department: string] -\end{verbatim} - -Because when we call for an object that stores a Spark DataFrame (like -\texttt{df} and \texttt{students}), Spark will only calculate and print -a summary of the structure of your Spark DataFrame, and not the -DataFrame itself. - -So how can we actually see our DataFrame? How can we visualize the rows -and values that are stored inside of it? For this, we use the -\texttt{show()} method. With this method, Spark will print the table as -pure text, as you can see in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{students.show()} -\end{Highlighting} -\end{Shaded} - - -\begin{verbatim} -+---------+-------+---+------+------+------+------+------+ -|StudentID| Name|Age|Height|Score1|Score2|Score3|Score4| -+---------+-------+---+------+------+------+------+------+ -| 12114| Anne| 21| 1.56| 8| 9| 10| 9| -| 13007| Adrian| 23| 1.82| 6| 6| 8| 7| -| 10045| George| 29| 1.77| 10| 9| 10| 7| -| 12459|Adeline| 26| 1.61| 8| 6| 7| 7| -| 10190| Mayla| 22| 1.67| 7| 7| 7| 9| -| 11552| Daniel| 24| 1.75| 9| 9| 10| 9| -+---------+-------+---+------+------+------+------+------+ -... with 2 more columns: Course, Department -\end{verbatim} - -By default, this method shows only the top rows of your DataFrame, but -you can specify how much rows exactly you want to see, by using -\texttt{show(n)}, where \texttt{n} is the number of rows. 
For example, I -can visualize only the first 2 rows of \texttt{df} like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df.show(}\DecValTok{2}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+---+-----+----------+ -| id|value| date| -+---+-----+----------+ -| 1| 28.3|2021-01-01| -| 2| 15.8|2021-01-01| -+---+-----+----------+ -only showing top 2 rows -\end{verbatim} - -\section{Getting the name of the -columns}\label{getting-the-name-of-the-columns} - -If you need to, you can easily collect a python list with the column -names present in your DataFrame, in the same way you would do in a -\texttt{pandas} DataFrame. That is, by using the \texttt{columns} method -of your DataFrame, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{students.columns} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -['StudentID', - 'Name', - 'Age', - 'Height', - 'Score1', - 'Score2', - 'Score3', - 'Score4', - 'Course', - 'Department'] -\end{verbatim} - -\section{Getting the number of rows}\label{getting-the-number-of-rows} - -If you want to know the number of rows present in a Spark DataFrame, -just use the \texttt{count()} method of this DataFrame. As a result, -Spark will build this DataFrame, and count the number of rows present in -it. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{students.count()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -6 -\end{verbatim} - -\section{Spark Data Types}\label{spark-data-types} - -Each column of your Spark DataFrame is associated with a specific data -type. Spark supports a large number of different data types. You can see -the full list at the official documentation page\footnote{The full list - is available at the link - \url{https://spark.apache.org/docs/3.3.0/sql-ref-datatypes.html\#supported-data-types}}. -For now, we will focus on the most used data types, which are listed -below: - -\begin{itemize} -\tightlist -\item - \texttt{IntegerType}: Represents 4-byte signed integer numbers. The - range of numbers that it can represent is from -2147483648 to - 2147483647. -\item - \texttt{LongType}: Represents 8-byte signed integer numbers. The range - of numbers that it can represent is from -9223372036854775808 to - 9223372036854775807. -\item - \texttt{FloatType}: Represents 4-byte single-precision floating point - numbers. -\item - \texttt{DoubleType}: Represents 8-byte double-precision floating point - numbers. -\item - \texttt{StringType}: Represents character string values. -\item - \texttt{BooleanType}: Represents boolean values (true or false). -\item - \texttt{TimestampType}: Represents datetime values, i.e.~values that - contains fields year, month, day, hour, minute, and second, with the - session local time-zone. The timestamp value represents an absolute - point in time. -\item - \texttt{DateType}: Represents date values, i.e.~values that contains - fields year, month and day, without a time-zone. -\end{itemize} - -Besides these more ``standard'' data types, Spark supports two other -complex types, which are \texttt{ArrayType} and \texttt{MapType}: - -\begin{itemize} -\item - \texttt{ArrayType(elementType,\ containsNull)}: Represents a sequence - of elements with the type of \texttt{elementType}. - \texttt{containsNull} is used to indicate if elements in a - \texttt{ArrayType} value can have \texttt{null} values. -\item - \texttt{MapType(keyType,\ valueType,\ valueContainsNull)}: Represents - a set of key-value pairs. 
The data type of keys is described by
-  \texttt{keyType} and the data type of values is described by
-  \texttt{valueType}. For a \texttt{MapType} value, keys are not allowed
-  to have \texttt{null} values. \texttt{valueContainsNull} is used to
-  indicate if values of a \texttt{MapType} value can have \texttt{null}
-  values.
-\end{itemize}
-
-Each one of these Spark data types has a corresponding python class in
-\texttt{pyspark}, which is stored in the \texttt{pyspark.sql.types}
-module. As a result, to access, let's say, the \texttt{StringType} type,
-we can do this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ StringType}
-\NormalTok{s }\OperatorTok{=}\NormalTok{ StringType()}
-\BuiltInTok{print}\NormalTok{(s)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-StringType()
-\end{verbatim}
-
-\section{The DataFrame Schema}\label{sec-dataframe-schema}
-
-The schema of a Spark DataFrame is the combination of column names and
-the data types associated with each of these columns. Schemas can be set
-explicitly by you (that is, you can tell Spark what the schema of your
-DataFrame should look like), or they can be automatically defined by
-Spark while reading or creating your data.
-
-You can get a succinct description of a DataFrame schema by looking
-inside the object where this DataFrame is stored. For example, let's
-look again at the \texttt{df} DataFrame.
-
-In the result below, we can see that \texttt{df} has three columns
-(\texttt{id}, \texttt{value} and \texttt{date}). By the description
-\texttt{id:\ bigint}, we know that \texttt{id} is a column of type
-\texttt{bigint}, which translates to the \texttt{LongType()} of Spark.
-Furthermore, by the descriptions \texttt{value:\ double} and
-\texttt{date:\ date}, we also know that the columns \texttt{value} and
-\texttt{date} are of type \texttt{DoubleType()} and \texttt{DateType()},
-respectively.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-DataFrame[id: bigint, value: double, date: date]
-\end{verbatim}
-
-You can also visualize a more complete report of the DataFrame schema by
-using the \texttt{printSchema()} method, like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df.printSchema()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-root
- |-- id: long (nullable = true)
- |-- value: double (nullable = true)
- |-- date: date (nullable = true)
-\end{verbatim}
-
-\subsection{Accessing the DataFrame
-schema}\label{accessing-the-dataframe-schema}
-
-So, by calling the object of your DataFrame (i.e.~an object of class
-\texttt{DataFrame}) you can see a small description of the schema of
-this DataFrame. But how can you access this schema programmatically?
-
-You do this by using the \texttt{schema} attribute of your DataFrame,
-like in the example below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df.schema}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-StructType([StructField('id', LongType(), True),
-StructField('value', DoubleType(), True), StructField('date',
-DateType(), True)])
-\end{verbatim}
-
-The result of the \texttt{schema} attribute is a \texttt{StructType()}
-object, which contains some information about each column of your
-DataFrame. More specifically, a \texttt{StructType()} object is filled
-with multiple \texttt{StructField()} objects.
Each
-\texttt{StructField()} object stores the name and the type of a column,
-and a boolean value (\texttt{True} or \texttt{False}) that indicates if
-this column can contain null values inside of it.
-
-You can use a \texttt{for} loop to iterate through this
-\texttt{StructType()} and get the information about each column
-separately.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{schema }\OperatorTok{=}\NormalTok{ df.schema}
-\ControlFlowTok{for}\NormalTok{ column }\KeywordTok{in}\NormalTok{ schema:}
-    \BuiltInTok{print}\NormalTok{(column)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-StructField('id', LongType(), True)
-StructField('value', DoubleType(), True)
-StructField('date', DateType(), True)
-\end{verbatim}
-
-You can access just the data type of each column by using the
-\texttt{dataType} attribute of each \texttt{StructField()} object.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ControlFlowTok{for}\NormalTok{ column }\KeywordTok{in}\NormalTok{ schema:}
-\NormalTok{    datatype }\OperatorTok{=}\NormalTok{ column.dataType}
-    \BuiltInTok{print}\NormalTok{(datatype)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-LongType()
-DoubleType()
-DateType()
-\end{verbatim}
-
-And you can do the same for column names and the boolean value (that
-indicates if the column can contain ``null'' values) by using the
-\texttt{name} and \texttt{nullable} attributes, respectively.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# Accessing the name of each column}
-\ControlFlowTok{for}\NormalTok{ column }\KeywordTok{in}\NormalTok{ schema:}
-    \BuiltInTok{print}\NormalTok{(column.name)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-id
-value
-date
-\end{verbatim}
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# Accessing the boolean value that indicates}
-\CommentTok{\# if the column can contain null values}
-\ControlFlowTok{for}\NormalTok{ column }\KeywordTok{in}\NormalTok{ schema:}
-    \BuiltInTok{print}\NormalTok{(column.nullable)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-True
-True
-True
-\end{verbatim}
-
-\subsection{Building a DataFrame
-schema}\label{building-a-dataframe-schema}
-
-When Spark creates a new DataFrame, it will automatically guess which
-schema is appropriate for that DataFrame. In other words, Spark will try
-to guess which are the appropriate data types for each column. But this
-is just a guess, and sometimes Spark goes way off.
-
-Because of that, in some cases you have to tell Spark how exactly you
-want this DataFrame schema to look. To do that, you need to build the
-DataFrame schema yourself, with the \texttt{StructType()} and
-\texttt{StructField()} constructors, alongside the Spark data types
-(i.e.~\texttt{StringType()}, \texttt{DoubleType()},
-\texttt{IntegerType()}, \ldots). Remember, all of these python classes
-come from the \texttt{pyspark.sql.types} module.
-
-In the example below, the \texttt{schema} object represents the schema
-of the \texttt{registers} DataFrame. This DataFrame has three columns
-(\texttt{ID}, \texttt{Date}, \texttt{Name}) of types
-\texttt{IntegerType}, \texttt{DateType} and \texttt{StringType},
-respectively.
-
-You can see below that I deliver this \texttt{schema} object that I
-built to \texttt{spark.createDataFrame()}. Now
-\texttt{spark.createDataFrame()} will follow the schema I described in
-this \texttt{schema} object when building the \texttt{registers}
-DataFrame.
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ StructType, StructField} -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ DateType, StringType, IntegerType} -\ImportTok{from}\NormalTok{ datetime }\ImportTok{import}\NormalTok{ date} - -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\DecValTok{1}\NormalTok{, date(}\DecValTok{2022}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{1}\NormalTok{), }\StringTok{\textquotesingle{}Anne\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{2}\NormalTok{, date(}\DecValTok{2022}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{3}\NormalTok{), }\StringTok{\textquotesingle{}Layla\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{3}\NormalTok{, date(}\DecValTok{2022}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{15}\NormalTok{), }\StringTok{\textquotesingle{}Wick\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{4}\NormalTok{, date(}\DecValTok{2022}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{11}\NormalTok{), }\StringTok{\textquotesingle{}Paul\textquotesingle{}}\NormalTok{)} -\NormalTok{]} - -\NormalTok{schema }\OperatorTok{=}\NormalTok{ StructType([} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}ID\textquotesingle{}}\NormalTok{, IntegerType(), }\VariableTok{True}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}Date\textquotesingle{}}\NormalTok{, DateType(), }\VariableTok{True}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}Name\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{True}\NormalTok{)} -\NormalTok{])} - -\NormalTok{registers }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, schema }\OperatorTok{=}\NormalTok{ schema)} -\end{Highlighting} -\end{Shaded} - -Having this example in mind, in order to build a DataFrame schema from -scratch, you have to build the equivalent \texttt{StructType()} object -that represents the schema you want. - -\subsection{Checking your DataFrame -schema}\label{checking-your-dataframe-schema} - -In some cases, you need to include in your \texttt{pyspark} program, -some checks that certifies that your Spark DataFrame have the expected -schema. In other words, you want to take actions if your DataFrame have -a different schema that might cause a problem in your program. - -To check if a specific column of your DataFrame is associated with the -data type \(x\), you have to use the DataFrame schema to check if the -respective column is an ``instance'' of the python class that represents -that data type \(x\). Lets use the \texttt{df} DataFrame as an example. - -Suppose you wanted to check if the \texttt{id} column is of type -\texttt{IntegerType}. To do this check, we use the python built-in -function \texttt{isinstance()} with the python class that represents the -Spark \texttt{IntegerType} data type. But, you can see in the result -below, that the \texttt{id} column is not of type \texttt{IntegerType}. 
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ IntegerType} -\NormalTok{schema }\OperatorTok{=}\NormalTok{ df.schema} -\NormalTok{id\_column }\OperatorTok{=}\NormalTok{ schema[}\DecValTok{0}\NormalTok{]} -\BuiltInTok{isinstance}\NormalTok{(id\_column.dataType, IntegerType)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -False -\end{verbatim} - -This unexpected result happens, because the \texttt{id} column is -actually from the ``big integer'' type, or, the \texttt{LongType} (which -are 8-byte signed integer). You can see below, that now the test results -in true: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ LongType} -\BuiltInTok{isinstance}\NormalTok{(id\_column.dataType, LongType)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -True -\end{verbatim} - -\bookmarksetup{startatroot} - -\chapter{\texorpdfstring{Introducing the \texttt{Column} -class}{Introducing the Column class}}\label{introducing-the-column-class} - -As we described at the introduction of -Chapter~\ref{sec-dataframes-chapter}, you will massively use the methods -from the \texttt{DataFrame} class in your Spark applications to manage, -modify and calculate your Spark DataFrames. - -However, there is one more python class that provides some very useful -methods that you will regularly use, which is the \texttt{Column} class, -or more specifically, the \texttt{pyspark.sql.column.Column} class. - -The \texttt{Column} class is used to represent a column in a Spark -DataFrame. This means that, each column of your Spark DataFrame is a -object of class \texttt{Column}. - -We can confirm this statement, by taking the \texttt{df} DataFrame that -we showed at Section~\ref{sec-building-a-dataframe}, and look at the -class of any column of it. Like the \texttt{id} column: - -\begin{Shaded} -\begin{Highlighting}[] -\BuiltInTok{type}\NormalTok{(df.}\BuiltInTok{id}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -pyspark.sql.column.Column -\end{verbatim} - -\section{Building a column object}\label{building-a-column-object} - -You can refer to or create a column, by using the \texttt{col()} and -\texttt{column()} functions from \texttt{pyspark.sql.functions} module. -These functions receive a string input with the name of the column you -want to create/refer to. - -Their result are always a object of class \texttt{Column}. For example, -the code below creates a column called \texttt{ID}: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} -\NormalTok{id\_column }\OperatorTok{=}\NormalTok{ col(}\StringTok{\textquotesingle{}ID\textquotesingle{}}\NormalTok{)} -\BuiltInTok{print}\NormalTok{(id\_column)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -Column<'ID'> -\end{verbatim} - -\section{Columns are strongly related to -expressions}\label{sec-columns-related-expressions} - -Many kinds of transformations that we want to apply over a Spark -DataFrame, are usually described through expressions, and, these -expressions in Spark are mainly composed by \textbf{column -transformations}. That is why the \texttt{Column} class, and its -methods, are so important in Apache Spark. - -Columns in Spark are so strongly related to expressions that the columns -themselves are initially interpreted as expressions. 
If we look again at
-the column \texttt{id} from the \texttt{df} DataFrame, Spark will bring
-back an expression as a result, and not the values held by this column.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df.}\BuiltInTok{id}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-Column<'id'>
-\end{verbatim}
-
-Having these ideas in mind, when I created the column \texttt{ID} in the
-previous section, I created a ``column expression''. This means that
-\texttt{col("ID")} is just an expression, and as a consequence, Spark
-does not know which are the values of column \texttt{ID}, or where it
-lives (that is, which DataFrame this column belongs to). For now, Spark
-is not interested in this information, it just knows that we have an
-expression referring to a column called \texttt{ID}.
-
-These ideas relate a lot to the \textbf{lazy aspect} of Spark that we
-talked about in Section~\ref{sec-viewing-a-dataframe}. Spark will not
-perform any heavy calculation, or show you the actual results/values
-from your code, until you trigger the calculations with an action (we
-will talk more about these ``actions'' in
-Section~\ref{sec-dataframe-actions}). As a result, when you access a
-column, Spark will only deliver an expression that represents that
-column, and not the actual values of that column.
-
-This is handy, because we can store our expressions in variables and
-reuse them later, in multiple parts of our code. For example, I can keep
-building and merging a column with different kinds of operators, to
-build a more complex expression. In the example below, I create an
-expression that doubles the values of the \texttt{ID} column:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{expr1 }\OperatorTok{=}\NormalTok{ id\_column }\OperatorTok{*} \DecValTok{2}
-\BuiltInTok{print}\NormalTok{(expr1)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-Column<'(ID * 2)'>
-\end{verbatim}
-
-Remember, with this expression Spark knows that we want to get a column
-called \texttt{ID} somewhere, and double its values. But Spark will not
-perform that action right now.
-
-Logical expressions follow the same logic. In the example below, I am
-looking for rows where the value in column \texttt{Name} is equal to
-\texttt{\textquotesingle{}Anne\textquotesingle{}}, and the value in
-column \texttt{Grade} is above 6.
-
-Again, Spark just checks if this is a valid logical expression. For now,
-Spark does not want to know where these \texttt{Name} and \texttt{Grade}
-columns are. Spark does not evaluate the expression until we ask for it
-with an action:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{expr2 }\OperatorTok{=}\NormalTok{ (col(}\StringTok{\textquotesingle{}Name\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \StringTok{\textquotesingle{}Anne\textquotesingle{}}\NormalTok{) }\OperatorTok{\&}\NormalTok{ (col(}\StringTok{\textquotesingle{}Grade\textquotesingle{}}\NormalTok{) }\OperatorTok{\textgreater{}} \DecValTok{6}\NormalTok{)}
-\BuiltInTok{print}\NormalTok{(expr2)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-Column<'((Name = Anne) AND (Grade > 6))'>
-\end{verbatim}
-
-\section{Literal values versus
-expressions}\label{literal-values-versus-expressions}
-
-We know now that columns of a Spark DataFrame have a deep connection
-with expressions.
But, on the other hand, there are some situations in which
-you write a value (it can be a string, an integer, a boolean, or
-anything) inside your \texttt{pyspark} code, and you actually want
-Spark to interpret this value as a constant (or a literal) value, rather
-than as an expression.
-
-As an example, let's suppose you control the data generated by the sales
-of five different stores, scattered across different regions of Belo
-Horizonte city (in Brazil). Now, let's suppose you receive a batch of
-data generated by the 4th store in the city, which is located at
-Amazonas Avenue, 324. This batch of data is exposed below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{path }\OperatorTok{=} \StringTok{\textquotesingle{}./../Data/sales.json\textquotesingle{}}
-\NormalTok{sales }\OperatorTok{=}\NormalTok{ spark.read.json(path)}
-\NormalTok{sales.show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-----+----------+------------+-------+-------------------+-----+
-|price|product_id|product_name|sale_id|          timestamp|units|
-+-----+----------+------------+-------+-------------------+-----+
-| 3.12|       134| Milk 1L Mua| 328711|2022-02-01T22:10:02|    1|
-| 1.22|       110|  Coke 350ml| 328712|2022-02-03T11:42:09|    3|
-| 4.65|       117|    Pepsi 2L| 328713|2022-02-03T14:22:15|    1|
-| 1.22|       110|  Coke 350ml| 328714|2022-02-03T18:33:08|    1|
-| 0.85|       341|Trident Mint| 328715|2022-02-04T15:41:36|    1|
-+-----+----------+------------+-------+-------------------+-----+
-\end{verbatim}
-
-If you look at this batch\ldots{} there is no indication that these
-sales come from the 4th store. In other words, this information is not
-present in the data, it is just in your mind. It certainly is a very bad
-idea to leave this data as is, without any identification of its source.
-So, you might want to add some labels and new columns to this batch of
-data, that can easily identify the store that originated these sales.
-
-For example, we could add two new columns to this \texttt{sales}
-DataFrame. One for the number that identifies the store (4), and another
-to keep the store address. Considering that all rows in this batch come
-from the 4th store, we should add two ``constant'' columns, meaning that
-these columns should have a constant value across all rows in this
-batch. But how can we do this? How can we create a ``constant'' column?
-The answer is: by forcing Spark to interpret the values as literal
-values, instead of as expressions.
-
-In other words, I cannot use the \texttt{col()} function to create these
-two new columns, because this \texttt{col()} function receives a column
-name as input. \textbf{It interprets our input as an expression that
-refers to a column name}. This function does not accept some sort of
-description of the actual values that this column should store.
-
-\section{Passing a literal (or a constant) value to
-Spark}\label{sec-literal-values}
-
-So how do we force Spark to interpret a value as a literal (or constant)
-value, rather than as an expression? To do this, you must write this
-value inside the \texttt{lit()} (short for ``literal'') function from
-the \texttt{pyspark.sql.functions} module.
-
-When you write the statement \texttt{lit(4)} in your code, Spark
-understands that you want to create a new column which is filled with
-4's. In other words, this new column is filled with the constant
-integer 4.
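-
-As a quick illustration (a minimal sketch, assuming a running Spark
-Session and the imports shown in the code), \texttt{lit()} returns a
-\texttt{Column} object just like \texttt{col()} does, but one that wraps
-a constant value instead of a reference to an existing column. On recent
-Spark versions, the printed expressions look like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ lit, col}
-\CommentTok{\# A literal (constant) column expression:}
-\BuiltInTok{print}\NormalTok{(lit(}\DecValTok{4}\NormalTok{))}
-\CommentTok{\# Compare it with a column reference expression:}
-\BuiltInTok{print}\NormalTok{(col(}\StringTok{\textquotesingle{}store\_number\textquotesingle{}}\NormalTok{))}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-Column<'4'>
-Column<'store_number'>
-\end{verbatim}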
-
-With the code below, I am creating two new columns (called
-\texttt{store\_number} and \texttt{store\_address}), and adding them to
-the \texttt{sales} DataFrame.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ lit}
-\NormalTok{store\_number }\OperatorTok{=}\NormalTok{ lit(}\DecValTok{4}\NormalTok{).alias(}\StringTok{\textquotesingle{}store\_number\textquotesingle{}}\NormalTok{)}
-\NormalTok{store\_address }\OperatorTok{=}\NormalTok{ lit(}\StringTok{\textquotesingle{}Amazonas Avenue, 324\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}store\_address\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{sales }\OperatorTok{=}\NormalTok{ sales}\OperatorTok{\textbackslash{}}
-\NormalTok{    .select(}
-\NormalTok{        }\StringTok{\textquotesingle{}*\textquotesingle{}}\NormalTok{, store\_number, store\_address}
-\NormalTok{    )}
-
-\NormalTok{sales}\OperatorTok{\textbackslash{}}
-\NormalTok{    .select(}
-\NormalTok{        }\StringTok{\textquotesingle{}product\_id\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}product\_name\textquotesingle{}}\NormalTok{,}
-\NormalTok{        }\StringTok{\textquotesingle{}store\_number\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}store\_address\textquotesingle{}}
-\NormalTok{    )}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+----------+------------+------------+--------------------+
-|product_id|product_name|store_number|       store_address|
-+----------+------------+------------+--------------------+
-|       134| Milk 1L Mua|           4|Amazonas Avenue, 324|
-|       110|  Coke 350ml|           4|Amazonas Avenue, 324|
-|       117|    Pepsi 2L|           4|Amazonas Avenue, 324|
-|       110|  Coke 350ml|           4|Amazonas Avenue, 324|
-|       341|Trident Mint|           4|Amazonas Avenue, 324|
-+----------+------------+------------+--------------------+
-\end{verbatim}
-
-In essence, you normally use the \texttt{lit()} function when you want
-to write a literal value in places where Spark expects a column name. In
-the example above, instead of writing the name of an existing column in
-the \texttt{sales} DataFrame, I wanted to write the literal values
-\texttt{\textquotesingle{}Amazonas\ Avenue,\ 324\textquotesingle{}} and
-\texttt{4}, and I used the \texttt{lit()} function to make this
-intention very clear to Spark. If I had not used the \texttt{lit()}
-function, the \texttt{select()} method would interpret the value
-\texttt{\textquotesingle{}Amazonas\ Avenue,\ 324\textquotesingle{}} as
-an existing column named \texttt{Amazonas\ Avenue,\ 324}.
-
-\section{\texorpdfstring{Key methods of the \texttt{Column}
-class}{Key methods of the Column class}}\label{key-methods-of-the-column-class}
-
-Because many transformations that we want to apply over our DataFrames
-are expressed as column transformations, the methods from the
-\texttt{Column} class will be quite useful in many different contexts.
-You will see many of these methods across the next chapters, like
-\texttt{desc()}, \texttt{alias()} and \texttt{cast()}.
-
-Remember, you can always use the \texttt{dir()} function to see the
-complete list of methods available in any python class. It is always
-useful to check the official documentation too\footnote{\url{https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.Column.html}}.
-There you will have a more complete description of each method.
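-
-For example, a minimal sketch of this idea (the exact list you get back
-depends on your \texttt{pyspark} version) is to filter out the python
-``dunder'' attributes and print only the public methods of a
-\texttt{Column} object:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col}
-\CommentTok{\# List only the public methods of a Column object:}
-\NormalTok{column\_methods }\OperatorTok{=}\NormalTok{ [}
-\NormalTok{    m }\ControlFlowTok{for}\NormalTok{ m }\KeywordTok{in} \BuiltInTok{dir}\NormalTok{(col(}\StringTok{\textquotesingle{}ID\textquotesingle{}}\NormalTok{))}
-\NormalTok{    }\ControlFlowTok{if} \KeywordTok{not}\NormalTok{ m.startswith(}\StringTok{\textquotesingle{}\_\textquotesingle{}}\NormalTok{)}
-\NormalTok{]}
-\BuiltInTok{print}\NormalTok{(column\_methods)}
-\end{Highlighting}
-\end{Shaded}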
- -But since they are so important in Spark, lets just give you a brief -overview of some of the most popular methods from the \texttt{Column} -class (these methods will be described in more detail in later -chapters): - -\begin{itemize} -\tightlist -\item - \texttt{desc()} and \texttt{asc()}: methods to order the values of the - column in a descending or ascending order (respectively); -\item - \texttt{cast()} and \texttt{astype()}: methods to cast (or convert) - the values of the column to a specific data type; -\item - \texttt{alias()}: method to rename a column; -\item - \texttt{substr()}: method that returns a new column with the sub - string of each value; -\item - \texttt{isNull()} and \texttt{isNotNull()}: logical methods to test if - each value in the column is a null value or not; -\item - \texttt{startswith()} and \texttt{endswith()}: logical methods to - search for values that starts with or ends with a specific pattern; -\item - \texttt{like()} and \texttt{rlike()}: logical methods to search for a - specific pattern or regular expression in the values of the column; -\item - \texttt{isin()}: logical method to test if each value in the column is - some of the listed values; -\end{itemize} - -\bookmarksetup{startatroot} - -\chapter{Transforming your Spark DataFrame - Part -1}\label{sec-transforming-dataframes-part1} - -Virtually every data analysis or data pipeline will include some ETL -(\emph{Extract, Transform, Load}) process, and the T is an essential -part of it. Because, you almost never have an input data, or a initial -DataFrame that perfectly fits your needs. - -This means that you always have to transform the initial data that you -have, to a specific format that you can use in your analysis. In this -chapter, you will learn how to apply some of these basic transformations -to your Spark DataFrame. - -\section{Defining -transformations}\label{sec-df-defining-transformations} - -Spark DataFrames are \textbf{immutable}, meaning that, they cannot be -directly changed. But you can use an existing DataFrame to create a new -one, based on a set of transformations. In other words, you define a new -DataFrame as a transformed version of an older DataFrame. - -Basically every \texttt{pyspark} program that you write will have such -transformations. Spark support many types of transformations, however, -in this chapter, we will focus on six basic transformations that you can -apply to a DataFrame: - -\begin{itemize} -\tightlist -\item - Filtering rows based on a logical condition; -\item - Selecting a subset of rows; -\item - Selecting specific columns; -\item - Adding or deleting columns; -\item - Sorting rows; -\item - Calculating aggregates; -\end{itemize} - -Therefore, when you apply one of the above transformations to an -existing DataFrame, you will get a new DataFrame as a result. You -usually combine multiple transformations together to get your desired -result. 
As a first example, lets get back to the \texttt{df} DataFrame: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ SparkSession} -\NormalTok{spark }\OperatorTok{=}\NormalTok{ SparkSession.builder.getOrCreate()} -\ImportTok{from}\NormalTok{ datetime }\ImportTok{import}\NormalTok{ date} -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ Row} - -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ Row(}\BuiltInTok{id} \OperatorTok{=} \DecValTok{1}\NormalTok{, value }\OperatorTok{=} \FloatTok{28.3}\NormalTok{, date }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2021}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{1}\NormalTok{)),} -\NormalTok{ Row(}\BuiltInTok{id} \OperatorTok{=} \DecValTok{2}\NormalTok{, value }\OperatorTok{=} \FloatTok{15.8}\NormalTok{, date }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2021}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{1}\NormalTok{)),} -\NormalTok{ Row(}\BuiltInTok{id} \OperatorTok{=} \DecValTok{3}\NormalTok{, value }\OperatorTok{=} \FloatTok{20.1}\NormalTok{, date }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2021}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{2}\NormalTok{)),} -\NormalTok{ Row(}\BuiltInTok{id} \OperatorTok{=} \DecValTok{4}\NormalTok{, value }\OperatorTok{=} \FloatTok{12.6}\NormalTok{, date }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2021}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{3}\NormalTok{))} -\NormalTok{]} - -\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data)} -\end{Highlighting} -\end{Shaded} - -In the example below, to create a new DataFrame called -\texttt{big\_values}, we begin with the \texttt{df} DataFrame, then, we -filter its rows where \texttt{value} is greater than 15, then, we select -\texttt{date} and \texttt{value} columns, then, we sort the rows based -on the \texttt{value} column. So, this set of sequential transformations -(filter it, then, select it, then, order it, \ldots) defines what this -new \texttt{big\_values} DataFrame is. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} -\CommentTok{\# You define a chain of transformations to} -\CommentTok{\# create a new DataFrame} -\NormalTok{big\_values }\OperatorTok{=}\NormalTok{ df}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{) }\OperatorTok{\textgreater{}} \DecValTok{15}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(}\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Thus, to apply a transformation to an existing DataFrame, we use -DataFrame methods such as \texttt{select()}, \texttt{filter()}, -\texttt{orderBy()} and many others. Remember, these are methods from the -python class that defines Spark DataFrame's (i.e.~the -\texttt{pyspark.sql.dataframe.DataFrame} class). - -This means that you can apply these transformations only to Spark -DataFrames, and no other kind of python object. For example, if you try -to use the \texttt{orderBy()} method in a standard python string -(i.e.~an object of class \texttt{str}), you will get an -\texttt{AttributeError} error. 
Because this class of object in python, -does not have a \texttt{orderBy()} method: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{s }\OperatorTok{=} \StringTok{"A python string"} -\NormalTok{s.orderBy(}\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -Traceback (most recent call last): - File "", line 1, in -AttributeError: 'str' object has no attribute 'orderBy' -\end{verbatim} - -Each one of these DataFrame methods create a \emph{lazily evaluated -transformation}. Once again, we see the \textbf{lazy} aspect of Spark -doing its work here. All these transformation methods are lazily -evaluated, meaning that, Spark will only check if they make sense with -the initial DataFrame that you have. Spark will not actually perform -these transformations on your initial DataFrame, not untill you trigger -these transformations with an \textbf{action}. - -\section{Triggering calculations with -actions}\label{sec-dataframe-actions} - -Therefore, Spark will avoid performing any heavy calculation until such -calculation is really needed. But how or when Spark will face this -decision? \textbf{When it encounters an action}. An action is the tool -you have to trigger Spark to actually perform the transformations you -have defined. - -\begin{quote} -An action instructs Spark to compute the result from a series of -transformations. (Chambers and Zaharia 2018). -\end{quote} - -There are four kinds of actions in Spark: - -\begin{itemize} -\tightlist -\item - Showing an output in the console; -\item - Writing data to some file or data source; -\item - Collecting data from a Spark DataFrame to native objects in python (or - Java, Scala, R, etc.); -\item - Counting the number of rows in a Spark DataFrame; -\end{itemize} - -You already know the first type of action, because we used it before -with the \texttt{show()} method. This \texttt{show()} method is an -action by itself, because you are asking Spark to show some output to -you. So we can make Spark to actually calculate the transformations that -defines the \texttt{big\_values} DataFrame, by asking Spark to show this -DataFrame to us. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{big\_values.show()} -\end{Highlighting} -\end{Shaded} - - -\begin{verbatim} -+----------+-----+ -| date|value| -+----------+-----+ -|2021-01-01| 15.8| -|2021-01-02| 20.1| -|2021-01-01| 28.3| -+----------+-----+ -\end{verbatim} - -Another very useful action is the \texttt{count()} method, that gives -you the number of rows in a DataFrame. To be able to count the number of -rows in a DataFrame, Spark needs to access this DataFrame in the first -place. That is why this \texttt{count()} method behaves as an action. -Spark will perform the transformations that defines \texttt{big\_values} -to access the actual rows of this DataFrame and count them. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{big\_values.count()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -3 -\end{verbatim} - -Furthermore, sometimes, you want to collect the data of a Spark -DataFrame to use it inside python. In other words, sometimes you need to -do some work that Spark cannot do by itself. To do so, you collect part -of the data that is being generated by Spark, and store it inside a -normal python object to use it in a standard python program. - -That is what the \texttt{collect()} method do. It transfers all the data -of your Spark DataFrame into a standard python list that you can easily -access with python. 
More specifically, you get a python list full of -\texttt{Row()} values: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data }\OperatorTok{=}\NormalTok{ big\_values.collect()} -\BuiltInTok{print}\NormalTok{(data)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -[Row(date=datetime.date(2021, 1, 1), value=15.8), - Row(date=datetime.date(2021, 1, 2), value=20.1), - Row(date=datetime.date(2021, 1, 1), value=28.3)] -\end{verbatim} - -The \texttt{take()} method is very similar to \texttt{collect()}. But -you usually apply \texttt{take()} when you need to collect just a small -section of your DataFrame (and not the entire thing), like the first -\texttt{n} rows. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{n }\OperatorTok{=} \DecValTok{1} -\NormalTok{first\_row }\OperatorTok{=}\NormalTok{ big\_values.take(n)} -\BuiltInTok{print}\NormalTok{(first\_row)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -[Row(date=datetime.date(2021, 1, 1), value=15.8)] -\end{verbatim} - -The last action would be the \texttt{write} method of a Spark DataFrame, -but we will explain this method latter at Chapter~\ref{sec-import}. - -\section{Understanding narrow and wide -transformations}\label{sec-narrow-wide} - -There are two kinds of transformations in Spark: narrow and wide -transformations. Remember, a Spark DataFrame is divided into many small -parts (called partitions), and, these parts are spread across the -cluster. The basic difference between narrow and wide transformations, -is if the transformation forces Spark to read data from multiple -partitions to generate a single part of the result of that -transformation, or not. - -More technically, narrow transformations are simply transformations -where 1 input data (or 1 partition of the input DataFrame) contributes -to only 1 partition of the output. - -\begin{figure} - -\centering{ - -\includegraphics{Chapters/../Figures/narrow-transformations.png} - -} - -\caption{\label{fig-narrow-transformations}Presenting narrow -transformations} - -\end{figure}% - -In other words, each partition of your input DataFrame will be used -(\emph{separately}) to generate one individual part of the result of -your transformation. As another perspective, you can understand narrow -transformations as those where Spark does not need to read the entire -input DataFrame to generate a single and small piece of your result. - -A classic example of narrow transformation is a filter. For example, -suppose you have three students (Anne, Carls and Mike), and that each -one has a bag full of blue, orange and red balls mixed. Now, suppose you -asked them to collect all the red balls of these bags, and combined them -in a single bag. - -To do this task, Mike does not need to know what balls are inside of the -bag of Carls or Anne. He just need to collect the red balls that are -solely on his bag. At the end of the task, each student will have a part -of the end result (that is, all the red balls that were in his own bag), -and they just need to combine all these parts to get the total result. - -The same thing applies to filters in Spark DataFrames. When you filter -all the rows where the column \texttt{state} is equal to -\texttt{"Alaska"}, Spark will filter all the rows in each partition -separately, and then, will combine all the outputs to get the final -result. - -In contrast, wide transformations are the opposite of that. In wide -transformations, Spark needs to use more than 1 partition of the input -DataFrame to generate a small piece of the result. 
-
-\begin{figure}
-
-\centering{
-
-\includegraphics{Chapters/../Figures/wide-transformations.png}
-
-}
-
-\caption{\label{fig-wide-transformations}Presenting wide
-transformations}
-
-\end{figure}%
-
-When this kind of transformation happens, each worker node of the
-cluster needs to share its partition with the others. In other words,
-what happens is a partition shuffle. Each worker node sends its
-partition to the others, so they can have access to it while performing
-their assigned tasks.
-
-Partition shuffles are a very popular topic in Apache Spark, because
-they can be a serious source of inefficiency in your Spark application
-(Chambers and Zaharia 2018). In more detail, when these shuffles
-happen, Spark needs to write data back to the hard disk of the
-computer, and this is not a very fast operation. This does not mean that
-wide transformations are bad or slow, just that the shuffles they
-produce can be a problem.
-
-A classic example of a wide transformation is a grouped aggregation. For
-example, let's suppose we have a DataFrame with the daily sales of
-multiple stores spread across the country, and we want to calculate
-the total sales per city/region. To calculate the total sales of a
-specific city, like ``São Paulo'', Spark would need to find all the rows
-that correspond to this city before adding the values, and these rows
-can be spread across multiple partitions of the cluster.
-
-\section{\texorpdfstring{The \texttt{transf}
-DataFrame}{The transf DataFrame}}\label{sec-transf-dataframe}
-
-To demonstrate some of the next examples in this chapter, we will use a
-different DataFrame called \texttt{transf}. The data that represents
-this DataFrame is freely available as a CSV file. You can download this
-CSV at the repository of this book\footnote{\url{https://github.com/pedropark99/Introd-pyspark/tree/main/Data}}.
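-
-If you prefer, you can also download this file programmatically. The
-sketch below uses the \texttt{urlretrieve()} function from python's
-standard \texttt{urllib} module, and saves the file at
-\texttt{../Data/transf.csv}, which is the relative path assumed by the
-next example:
-
-\begin{verbatim}
-from urllib.request import urlretrieve
-
-url = (
-    'https://raw.githubusercontent.com/'
-    'pedropark99/Introd-pyspark/'
-    'main/Data/transf.csv'
-)
-# Download the CSV file and save it locally
-urlretrieve(url, '../Data/transf.csv')
-\end{verbatim}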
- -With the code below, you can import the data from the -\texttt{transf.csv} CSV file, to recreate the \texttt{transf} DataFrame -in your Spark Session: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ StructType, StructField} -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ DoubleType, StringType} -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ LongType, TimestampType, DateType} -\NormalTok{path }\OperatorTok{=} \StringTok{"../Data/transf.csv"} -\NormalTok{schema }\OperatorTok{=}\NormalTok{ StructType([} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{, DateType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{, TimestampType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{, LongType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{, DoubleType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}transferCurrency\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}transferID\textquotesingle{}}\NormalTok{, LongType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}transferLog\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}destinationBankNumber\textquotesingle{}}\NormalTok{, LongType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}destinationBankBranch\textquotesingle{}}\NormalTok{, LongType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}destinationBankAccount\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{False}\NormalTok{)} -\NormalTok{])} - -\NormalTok{transf }\OperatorTok{=}\NormalTok{ spark.read}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(path, schema }\OperatorTok{=}\NormalTok{ schema, sep }\OperatorTok{=} \StringTok{";"}\NormalTok{, header }\OperatorTok{=} \VariableTok{True}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -You could also use the \texttt{pandas} library to read the DataFrame -directly from GitHub, without having to manually download the file: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{import}\NormalTok{ pandas }\ImportTok{as}\NormalTok{ pd} -\NormalTok{url }\OperatorTok{=} \StringTok{\textquotesingle{}https://raw.githubusercontent.com/\textquotesingle{}} -\NormalTok{url }\OperatorTok{=}\NormalTok{ url }\OperatorTok{+} \StringTok{\textquotesingle{}pedropark99/Introd{-}pyspark/\textquotesingle{}} -\NormalTok{url }\OperatorTok{=}\NormalTok{ url }\OperatorTok{+} \StringTok{\textquotesingle{}main/Data/transf.csv\textquotesingle{}} - -\NormalTok{transf\_pd }\OperatorTok{=}\NormalTok{ pd.read\_csv(} -\NormalTok{ url, sep }\OperatorTok{=} \StringTok{\textquotesingle{};\textquotesingle{}}\NormalTok{,} -\NormalTok{ dtype }\OperatorTok{=} \BuiltInTok{str}\NormalTok{,} -\NormalTok{ keep\_default\_na }\OperatorTok{=} \VariableTok{False} -\NormalTok{)} - -\NormalTok{transf\_pd[}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ 
(} -\NormalTok{ transf\_pd[}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{].astype(}\StringTok{\textquotesingle{}float\textquotesingle{}}\NormalTok{)} -\NormalTok{)} - -\NormalTok{columns\_to\_int }\OperatorTok{=}\NormalTok{ [} - \StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}transferID\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}destinationBankNumber\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}destinationBankBranch\textquotesingle{}} -\NormalTok{]} -\ControlFlowTok{for}\NormalTok{ column }\KeywordTok{in}\NormalTok{ columns\_to\_int:} -\NormalTok{ transf\_pd[column] }\OperatorTok{=}\NormalTok{ transf\_pd[column].astype(}\StringTok{\textquotesingle{}int\textquotesingle{}}\NormalTok{)} - -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ SparkSession} -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} -\NormalTok{spark }\OperatorTok{=}\NormalTok{ SparkSession.builder.getOrCreate()} -\NormalTok{transf }\OperatorTok{=}\NormalTok{ spark.createDataFrame(transf\_pd)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{).cast(}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{,} -\NormalTok{ col(}\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{).cast(}\StringTok{\textquotesingle{}timestamp\textquotesingle{}}\NormalTok{)} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -This \texttt{transf} DataFrame contains bank transfer records from a -fictitious bank. Before I show you the actual data of this DataFrame, is -useful to give you a quick description of each column that it contains: - -\begin{itemize} -\tightlist -\item - \texttt{dateTransfer}: the date when the transfer occurred; -\item - \texttt{datetimeTransfer}: the date and time when the transfer - occurred; -\item - \texttt{clientNumber} the unique number that identifies a client of - the bank; -\item - \texttt{transferValue}: the nominal value that was transferred; -\item - \texttt{transferCurrency}: the currency of the nominal value - transferred; -\item - \texttt{transferID}: an unique ID for the transfer; -\item - \texttt{transferLog}: store any error message that may have appeared - during the execution of the transfer; -\item - \texttt{destinationBankNumber}: the transfer destination bank number; -\item - \texttt{destinationBankBranch}: the transfer destination branch - number; -\item - \texttt{destinationBankAccount}: the transfer destination account - number; -\end{itemize} - -Now, to see the actual data of this DataFrame, we can use the -\texttt{show()} action as usual. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -As you can see below, this \texttt{transf} DataFrame have 2421 rows in -total: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.count()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -2421 -\end{verbatim} - -\section{Filtering rows of your -DataFrame}\label{filtering-rows-of-your-dataframe} - -To filter specific rows of a DataFrame, \texttt{pyspark} offers two -equivalent DataFrame methods called \texttt{where()} and -\texttt{filter()}. In other words, they both do the same thing, and work -in the same way. These methods receives as input a logical expression -that translates what you want to filter. - -As a first example, lets suppose you wanted to inspect all the rows from -the \texttt{transf} DataFrame where \texttt{transferValue} is less than -1000. To do so, you can use the following code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(}\StringTok{"transferValue \textless{} 1000"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-18|2022-12-18 08:45:30| 1297| 142.66| -| 2022-12-13|2022-12-13 20:44:23| 5516| 992.15| -| 2022-11-24|2022-11-24 20:01:39| 1945| 174.64| -| 2022-11-07|2022-11-07 16:35:57| 4862| 570.69| -| 2022-11-04|2022-11-04 20:00:34| 1297| 854.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -Writing simple SQL logical expression inside a string is the most easy -and ``clean'' way to create a filter expression in \texttt{pyspark}. -However, you could also write the same exact expression in a more -``pythonic'' way, using the \texttt{col()} function from -\texttt{pyspark.sql.functions} module. 
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} - -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{"transferValue"}\NormalTok{) }\OperatorTok{\textless{}} \DecValTok{1000}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-18|2022-12-18 08:45:30| 1297| 142.66| -| 2022-12-13|2022-12-13 20:44:23| 5516| 992.15| -| 2022-11-24|2022-11-24 20:01:39| 1945| 174.64| -| 2022-11-07|2022-11-07 16:35:57| 4862| 570.69| -| 2022-11-04|2022-11-04 20:00:34| 1297| 854.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -You still have a more verbose alternative, that does not require the -\texttt{col()} function. With this method, you refer to the specific -column using the dot operator (\texttt{.}), like in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# This will give you the exact} -\CommentTok{\# same result of the examples above} -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(transf.transferValue }\OperatorTok{\textless{}} \DecValTok{1000}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\subsection{Logical operators -available}\label{logical-operators-available} - -As we saw in the previous section, there are two ways to write logical -expressions in \texttt{pyspark}: 1) write a SQL logical expression -inside a string; 2) or, write a python logical expression using the -\texttt{col()} function. - -If you choose to write a SQL logical expressions in a string, you need -to use the logical operators of SQL in your expression (not the logical -operators of python). In the other hand, if you choose to write in the -``python'' way, then, you need to use the logical operators of python -instead. - -The logical operators of SQL are described in the table below: - -\begin{longtable}[]{@{} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.1124}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2472}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.6404}}@{}} -\caption{List of logical operators of -SQL}\label{tbl-logical-operators-sql}\tabularnewline -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Operator -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Example of expression -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Meaning of the expression -\end{minipage} \\ -\midrule\noalign{} -\endfirsthead -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Operator -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Example of expression -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Meaning of the expression -\end{minipage} \\ -\midrule\noalign{} -\endhead -\bottomrule\noalign{} -\endlastfoot -\textless{} & \texttt{x\ \textless{}\ y} & is \texttt{x} less than -\texttt{y}? 
\\ -\textgreater{} & \texttt{x\ \textgreater{}\ y} & is \texttt{x} greater -than \texttt{y}? \\ -\textless= & \texttt{x\ \textless{}=\ y} & is \texttt{x} less than or -equal to \texttt{y}? \\ -\textgreater= & \texttt{x\ \textgreater{}=\ y} & is \texttt{x} greater -than or equal to \texttt{y}? \\ -== & \texttt{x\ ==\ y} & is \texttt{x} equal to \texttt{y}? \\ -!= & \texttt{x\ !=\ y} & is \texttt{x} not equal to \texttt{y}? \\ -in & \texttt{x\ in\ y} & is \texttt{x} one of the values listed in -\texttt{y}? \\ -and & \texttt{x\ and\ y} & both logical expressions \texttt{x} and -\texttt{y} are true? \\ -or & \texttt{x\ or\ y} & at least one of logical expressions \texttt{x} -and \texttt{y} are true? \\ -not & \texttt{not\ x} & is the logical expression \texttt{x} not -true? \\ -\end{longtable} - -And, the logical operators of python are described in the table below: - -\begin{longtable}[]{@{} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.1124}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.2472}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.6404}}@{}} -\caption{List of logical operators of -python}\label{tbl-logical-operators-python}\tabularnewline -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Operator -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Example of expression -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Meaning of the expression -\end{minipage} \\ -\midrule\noalign{} -\endfirsthead -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Operator -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Example of expression -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Meaning of the expression -\end{minipage} \\ -\midrule\noalign{} -\endhead -\bottomrule\noalign{} -\endlastfoot -\textless{} & \texttt{x\ \textless{}\ y} & is \texttt{x} less than -\texttt{y}? \\ -\textgreater{} & \texttt{x\ \textgreater{}\ y} & is \texttt{x} greater -than \texttt{y}? \\ -\textless= & \texttt{x\ \textless{}=\ y} & is \texttt{x} less than or -equal to \texttt{y}? \\ -\textgreater= & \texttt{x\ \textgreater{}=\ y} & is \texttt{x} greater -than or equal to \texttt{y}? \\ -== & \texttt{x\ ==\ y} & is \texttt{x} equal to \texttt{y}? \\ -!= & \texttt{x\ !=\ y} & is \texttt{x} not equal to \texttt{y}? \\ -\& & \texttt{x\ \&\ y} & both logical expressions \texttt{x} and -\texttt{y} are true? \\ -\textbar{} & \texttt{x\ \textbar{}\ y} & at least one of logical -expressions \texttt{x} and \texttt{y} are true? \\ -\textasciitilde{} & \texttt{\textasciitilde{}x} & is the logical -expression \texttt{x} not true? \\ -\end{longtable} - -\subsection{Connecting multiple logical -expressions}\label{connecting-multiple-logical-expressions} - -Sometimes, you need to write more complex logical expressions to -correctly describe the rows you are interested in. That is, when you -combine multiple logical expressions together. - -As an example, lets suppose you wanted all the rows in \texttt{transf} -DataFrame from client of number 1297 where the transfer value is smaller -than 1000, and the date of the transfer is after 20 of February 2022. -These conditions are dependent, that is, they are connected to each -other (the client number, the transfer value and the date of the -transfer). That is why I used the \texttt{and} keyword between each -condition in the example below (i.e.~to connect these three conditions -together). 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{condition }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{ transferValue \textless{} 1000} -\StringTok{ and clientNumber == 1297 } -\StringTok{ and dateTransfer \textgreater{} \textquotesingle{}2022{-}02{-}20\textquotesingle{}} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(condition)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-18|2022-12-18 08:45:30| 1297| 142.66| -| 2022-11-04|2022-11-04 20:00:34| 1297| 854.0| -| 2022-02-27|2022-02-27 13:27:44| 1297| 697.21| -+------------+-------------------+------------+-------------+ -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -I could translate this logical expression into the ``pythonic'' way -(using the \texttt{col()} function). However, I would have to surround -each individual expression by parentheses, and, use the \texttt{\&} -operator to substitute the \texttt{and} keyword. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(} -\NormalTok{ (col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{) }\OperatorTok{\textless{}} \DecValTok{1000}\NormalTok{) }\OperatorTok{\&} -\NormalTok{ (col(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \DecValTok{1297}\NormalTok{) }\OperatorTok{\&} -\NormalTok{ (col(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{) }\OperatorTok{\textgreater{}} \StringTok{\textquotesingle{}2022{-}02{-}20\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-18|2022-12-18 08:45:30| 1297| 142.66| -| 2022-11-04|2022-11-04 20:00:34| 1297| 854.0| -| 2022-02-27|2022-02-27 13:27:44| 1297| 697.21| -+------------+-------------------+------------+-------------+ -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -This a \textbf{very important detail}, because it is very easy to -forget. When building your complex logical expressions in the ``python'' -way, always \textbf{remember to surround each expression by a pair of -parentheses}. 
Otherwise, you will get a very confusing and useless error -message, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(} -\NormalTok{ col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{) }\OperatorTok{\textless{}} \DecValTok{1000} \OperatorTok{\&} -\NormalTok{ col(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \DecValTok{1297} \OperatorTok{\&} -\NormalTok{ col(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{) }\OperatorTok{\textgreater{}} \StringTok{\textquotesingle{}2022{-}02{-}20\textquotesingle{}} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -Py4JError: An error occurred while calling o216.and. Trace: -py4j.Py4JException: Method and([class java.lang.Integer]) does no -t exist - at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngin -e.java:318) - at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngin -e.java:326) - at py4j.Gateway.invoke(Gateway.java:274) - at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand -.java:132) - at py4j.commands.CallCommand.execute(CallCommand.java:79) - at py4j.ClientServerConnection.waitForCommands(ClientServerCo -nnection.java:182) - at py4j.ClientServerConnection.run(ClientServerConnection.jav -a:106) - at java.base/java.lang.Thread.run(Thread.java:829) -\end{verbatim} - -In the above examples, we have logical expressions that are dependent on -each other. But, lets suppose these conditions were independent. In this -case, we would use the \texttt{or} keyword, instead of \texttt{and}. -Now, Spark will look for every row of \texttt{transf} where -\texttt{transferValue} is smaller than 1000, or, \texttt{clientNumber} -is equal to 1297, or, \texttt{dateTransfer} is greater than 20 of -February 2022. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{condition }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{ transferValue \textless{} 1000} -\StringTok{ or clientNumber == 1297 } -\StringTok{ or dateTransfer \textgreater{} \textquotesingle{}2022{-}02{-}20\textquotesingle{}} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(condition)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... 
with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -To translate this expression into the pythonic way, we have to -substitute the \texttt{or} keyword by the \texttt{\textbar{}} operator, -and surround each expression by parentheses again: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(} -\NormalTok{ (col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{) }\OperatorTok{\textless{}} \DecValTok{1000}\NormalTok{) }\OperatorTok{|} -\NormalTok{ (col(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \DecValTok{1297}\NormalTok{) }\OperatorTok{|} -\NormalTok{ (col(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{) }\OperatorTok{\textgreater{}} \StringTok{\textquotesingle{}2022{-}02{-}20\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -You can also increase the complexity of your logical expressions by -mixing dependent expressions with independent expressions. For example, -to filter all the rows where \texttt{dateTransfer} is greater than or -equal to 01 of October 2022, and \texttt{clientNumber} is either 2727 or -5188, you would have the following code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{condition }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{ (clientNumber == 2727 or clientNumber == 5188)} -\StringTok{ and dateTransfer \textgreater{}= \textquotesingle{}2022{-}10{-}01\textquotesingle{}} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(condition)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-29|2022-12-29 10:22:02| 2727| 4666.25| -| 2022-12-27|2022-12-27 03:58:25| 5188| 7821.69| -| 2022-12-26|2022-12-25 23:45:02| 2727| 3261.73| -| 2022-12-23|2022-12-23 05:32:49| 2727| 8042.0| -| 2022-12-22|2022-12-22 06:02:47| 5188| 8175.67| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... 
with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -If you investigate the above condition carefully, maybe, you will -identify that this condition could be rewritten in a simpler format, by -using the \texttt{in} keyword. This way, Spark will look for all the -rows where \texttt{clientNumber} is equal to one of the listed values -(2727 or 5188), and, that \texttt{dateTransfer} is greater than or equal -to 01 of October 2022. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{condition }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{ clientNumber in (2727, 5188)} -\StringTok{ and dateTransfer \textgreater{}= \textquotesingle{}2022{-}10{-}01\textquotesingle{}} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(condition)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-29|2022-12-29 10:22:02| 2727| 4666.25| -| 2022-12-27|2022-12-27 03:58:25| 5188| 7821.69| -| 2022-12-26|2022-12-25 23:45:02| 2727| 3261.73| -| 2022-12-23|2022-12-23 05:32:49| 2727| 8042.0| -| 2022-12-22|2022-12-22 06:02:47| 5188| 8175.67| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -\subsection{\texorpdfstring{Translating the \texttt{in} keyword to the -pythonic -way}{Translating the in keyword to the pythonic way}}\label{translating-the-in-keyword-to-the-pythonic-way} - -Python does have a \texttt{in} keyword just like SQL, but, this keyword -does not work as expected in \texttt{pyspark}. To write a logical -expression, using the pythonic way, that filters the rows where a column -is equal to one of the listed values, you can use the \texttt{isin()} -method. - -This method belongs to the \texttt{Column} class, so, you should always -use \texttt{isin()} after a column name or a \texttt{col()} function. In -the example below, we are filtering the rows where -\texttt{destinationBankNumber} is 290 or 666: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}destinationBankNumber\textquotesingle{}}\NormalTok{).isin(}\DecValTok{290}\NormalTok{, }\DecValTok{666}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:44:46| 1121| 7158.0| -| 2022-12-31|2022-12-31 01:02:06| 4862| 6714.0| -| 2022-12-31|2022-12-31 00:48:47| 3294| 10882.52| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... 
with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -\subsection{Negating logical -conditions}\label{negating-logical-conditions} - -In some cases, is easier to describe what rows you \textbf{do not want} -in your filter. That is, you want to negate (or invert) your logical -expression. For this, SQL provides the \texttt{not} keyword, that you -place before the logical expression you want to negate. - -For example, we can filter all the rows of \texttt{transf} where -\texttt{clientNumber} is not equal to 3284. Remember, the methods -\texttt{filter()} and \texttt{where()} are equivalents or synonymous -(they both mean the same thing). - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{condition }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{ not clientNumber == 3284} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(condition)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -To translate this expression to the pythonic way, we use the -\texttt{\textasciitilde{}} operator. However, because we are negating -the logical expression as a whole, is important to surround the entire -expression with parentheses. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(}\OperatorTok{\textasciitilde{}}\NormalTok{(col(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \DecValTok{3284}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -If you forget to add the parentheses, Spark will think you are negating -just the column, and not the entire expression. 
That would not make -sense, and, as a result, Spark would throw an error: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(}\OperatorTok{\textasciitilde{}}\NormalTok{col(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \DecValTok{3284}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -AnalysisException: cannot resolve '(NOT clientNumber)' due to dat -a type mismatch: argument 1 requires boolean type, however, 'clie -ntNumber' is of bigint type.; -'Filter (NOT clientNumber#210L = 3284) -\end{verbatim} - -Because the \texttt{\textasciitilde{}} operator is a little discrete and -can go unnoticed, I sometimes use a different approach to negate my -logical expressions. I make the entire expression equal to -\texttt{False}. This way, I get all the rows where that particular -expression is \texttt{False}. This makes my intention more visible in -the code, but, is harder to write it. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Filter all the rows where \textasciigrave{}clientNumber\textasciigrave{} is not equal to} -\CommentTok{\# 2727 or 5188.} -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where( (col(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{).isin(}\DecValTok{2727}\NormalTok{, }\DecValTok{5188}\NormalTok{)) }\OperatorTok{==} \VariableTok{False}\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -\subsection{\texorpdfstring{Filtering \texttt{null} values (i.e.~missing -data)}{Filtering null values (i.e.~missing data)}}\label{sec-filter-null-values} - -Sometimes, the \texttt{null} values play an important role in your -filter. You either want to collect all these \texttt{null} values, so -you can investigate why they are null in the first place, or, you want -to completely eliminate them from your DataFrame. - -Because this is a special kind of value in Spark, with a special meaning -(the ``absence'' of a value), you need to use a special syntax to -correctly filter these values in your DataFrame. 
In SQL, you can use the -\texttt{is} keyword to filter these values: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(}\StringTok{\textquotesingle{}transferLog is null\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -However, if you want to remove these values from your DataFrame, then, -you can just negate (or invert) the above expression with the -\texttt{not} keyword, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(}\StringTok{\textquotesingle{}not transferLog is null\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-05|2022-12-05 00:51:00| 2197| 8240.62| -| 2022-09-20|2022-09-19 21:59:51| 5188| 7583.9| -| 2022-09-03|2022-09-03 06:07:59| 3795| 3654.0| -| 2022-07-02|2022-07-02 15:29:50| 4465| 5294.0| -| 2022-06-14|2022-06-14 10:21:55| 1121| 7302.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -The \texttt{is} and \texttt{not} keywords in SQL have a special -relation. Because you can create the same negation/inversion of the -expression by inserting the \texttt{not} keyword in the middle of the -expression (you can do this too in expressions with the \texttt{in} -keyword). In other words, you might see, in someone else's code, the -same expression above written in this form: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(}\StringTok{\textquotesingle{}transferLog is not null\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Both forms are equivalent and valid SQL logical expressions. But the -latter is a strange version. Because we cannot use the \texttt{not} -keyword in this manner on other kinds of logical expressions. Normally, -we put the \texttt{not} keyword \textbf{before} the logical expression -we want to negate, not in the middle of it. Anyway, just have in mind -that this form of logical expression exists, and, that is a perfectly -valid one. 
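-
-If you want to check that both forms really select the same rows, a
-quick sanity check (just a sketch, assuming the \texttt{transf}
-DataFrame from the previous sections is loaded) is to compare the number
-of rows that each filter returns:
-
-\begin{verbatim}
-n1 = transf.where('transferLog is not null').count()
-n2 = transf.where('not transferLog is null').count()
-
-# Both filters keep the rows where `transferLog`
-# is not a null value, so the counts must match
-print(n1 == n2)
-\end{verbatim}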
- -When we translate the above examples to the ``pythonic'' way, many -people tend to use the \texttt{null} equivalent of python, that is, the -\texttt{None} value, in the expression. But as you can see in the result -below, this method does not work as expected: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(col(}\StringTok{\textquotesingle{}transferLog\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \VariableTok{None}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+----------------+------------+-------------+ -|dateTransfer|datetimeTransfer|clientNumber|transferValue| -+------------+----------------+------------+-------------+ -+------------+----------------+------------+-------------+ -... with 6 more columns: transferCurrency, transferID, tr -ansferLog, destinationBankNumber, destinationBankBranch, -destinationBankAccount -\end{verbatim} - -The correct way to do this in \texttt{pyspark}, is to use the -\texttt{isNull()} method from the \texttt{Column} class. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(col(}\StringTok{\textquotesingle{}transferLog\textquotesingle{}}\NormalTok{).isNull())}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -If you want to eliminate the \texttt{null} values, just use the inverse -method \texttt{isNotNull()}. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(col(}\StringTok{\textquotesingle{}transferLog\textquotesingle{}}\NormalTok{).isNotNull())}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-05|2022-12-05 00:51:00| 2197| 8240.62| -| 2022-09-20|2022-09-19 21:59:51| 5188| 7583.9| -| 2022-09-03|2022-09-03 06:07:59| 3795| 3654.0| -| 2022-07-02|2022-07-02 15:29:50| 4465| 5294.0| -| 2022-06-14|2022-06-14 10:21:55| 1121| 7302.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... 
with 6 more columns: transferCurrency, transferID, trans
-ferLog, destinationBankNumber, destinationBankBranch, destin
-ationBankAccount
-\end{verbatim}
-
-\subsection{Filtering dates and datetimes in your
-DataFrame}\label{filtering-dates-and-datetimes-in-your-dataframe}
-
-Just as a quick side note, when you want to filter the rows of your
-DataFrame that fall on a particular date, you can easily write this
-particular date as a single string, like in the example below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df\_0702 }\OperatorTok{=}\NormalTok{ transf}\OperatorTok{\textbackslash{}}
-\NormalTok{  .where(col(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \StringTok{\textquotesingle{}2022{-}07{-}02\textquotesingle{}}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-When filtering datetimes, you can also write the datetimes as strings,
-like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{later\_0330\_pm }\OperatorTok{=}\NormalTok{ transf.where(}
-\NormalTok{  col(}\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{) }\OperatorTok{\textgreater{}} \StringTok{\textquotesingle{}2022{-}07{-}02 03:30:00\textquotesingle{}}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-However, it is a better practice to write these particular values using
-the built-in \texttt{date} and \texttt{datetime} classes of python, like
-this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ datetime }\ImportTok{import}\NormalTok{ date, datetime}
-
-\NormalTok{d }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2022}\NormalTok{,}\DecValTok{7}\NormalTok{,}\DecValTok{2}\NormalTok{)}
-\NormalTok{dt }\OperatorTok{=}\NormalTok{ datetime(}\DecValTok{2022}\NormalTok{,}\DecValTok{7}\NormalTok{,}\DecValTok{2}\NormalTok{,}\DecValTok{3}\NormalTok{,}\DecValTok{30}\NormalTok{,}\DecValTok{0}\NormalTok{)}
-
-\CommentTok{\# Filter all rows where \textasciigrave{}dateTransfer\textasciigrave{}}
-\CommentTok{\# is equal to "2022{-}07{-}02"}
-\NormalTok{df\_0702 }\OperatorTok{=}\NormalTok{ transf.where(}
-\NormalTok{  col(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{) }\OperatorTok{==}\NormalTok{ d}
-\NormalTok{)}
-
-\CommentTok{\# Filter all rows where \textasciigrave{}datetimeTransfer\textasciigrave{}}
-\CommentTok{\# is greater than "2022{-}07{-}02 03:30:00"}
-\NormalTok{later\_0330\_pm }\OperatorTok{=}\NormalTok{ transf.where(}
-\NormalTok{  col(}\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{) }\OperatorTok{\textgreater{}}\NormalTok{ dt}
-\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-When you translate the above expressions to SQL, you can also write the
-date and datetime values as strings. However, it is also a good idea to
-use the \texttt{CAST()} SQL function to convert these string values into
-the correct data type before the actual filter,
like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{condition\_d }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}}
-\StringTok{dateTransfer == CAST("2022{-}07{-}02" AS DATE)}
-\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}}
-
-\NormalTok{condition\_dt }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}}
-\StringTok{datetimeTransfer \textgreater{} CAST("2022{-}07{-}02 03:30:00" AS TIMESTAMP)}
-\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}}
-
-\CommentTok{\# Filter all rows where \textasciigrave{}dateTransfer\textasciigrave{}}
-\CommentTok{\# is equal to "2022{-}07{-}02"}
-\NormalTok{df\_0702 }\OperatorTok{=}\NormalTok{ transf.where(condition\_d)}
-
-\CommentTok{\# Filter all rows where \textasciigrave{}datetimeTransfer\textasciigrave{}}
-\CommentTok{\# is greater than "2022{-}07{-}02 03:30:00"}
-\NormalTok{later\_0330\_pm }\OperatorTok{=}\NormalTok{ transf.where(condition\_dt)}
-\end{Highlighting}
-\end{Shaded}
-
-In other words, with the SQL expression
-\texttt{CAST("2022-07-02\ 03:30:00"\ AS\ TIMESTAMP)} we are telling
-Spark to convert the literal string \texttt{"2022-07-02\ 03:30:00"} into
-an actual timestamp value.
-
-\subsection{Searching for a particular pattern in string
-values}\label{sec-filter-pattern-search}
-
-Spark offers different methods to search for a particular pattern within
-a string value. In this section, I describe how you can use these
-methods to find the rows in your DataFrame that match these patterns.
-
-\subsubsection{Starts with, ends with and
-contains}\label{starts-with-ends-with-and-contains}
-
-You can use the column methods \texttt{startswith()},
-\texttt{endswith()} and \texttt{contains()} to search for rows where an
-input string value starts with, ends with, or contains a particular
-pattern, respectively.
-
-These three methods return a boolean value that indicates whether the
-input string value matched the pattern that you gave to the method. You
-can use the boolean values they return to filter the rows of your
-DataFrame that match these patterns.
-
-Just as an example, in the following code, we are creating a new
-DataFrame called \texttt{persons}, which contains the description of 3
-persons (Alice, Bob and Charlie).
And I use these three methods to -search for different rows in the DataFrame: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} - -\NormalTok{persons }\OperatorTok{=}\NormalTok{ spark.createDataFrame(} -\NormalTok{ [} -\NormalTok{ (}\StringTok{\textquotesingle{}Alice\textquotesingle{}}\NormalTok{, }\DecValTok{25}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}Bob\textquotesingle{}}\NormalTok{, }\DecValTok{30}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}Charlie\textquotesingle{}}\NormalTok{, }\DecValTok{35}\NormalTok{)} -\NormalTok{ ],} -\NormalTok{ [}\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{]} -\NormalTok{)} - -\CommentTok{\# Filter the DataFrame to include only rows} -\CommentTok{\# where the "name" column starts with "A"} -\NormalTok{persons.}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{).startswith(}\StringTok{\textquotesingle{}A\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----+---+ -| name|age| -+-----+---+ -|Alice| 25| -+-----+---+ -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Filter the DataFrame to include only rows} -\CommentTok{\# where the "name" column ends with "e"} -\NormalTok{persons.}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{).endswith(}\StringTok{\textquotesingle{}e\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------+---+ -| name|age| -+-------+---+ -| Alice| 25| -|Charlie| 35| -+-------+---+ -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Filter the DataFrame to include only rows} -\CommentTok{\# where the "name" column contains "ob"} -\NormalTok{persons.}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{).contains(}\StringTok{\textquotesingle{}ob\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----+---+ -|name|age| -+----+---+ -| Bob| 30| -+----+---+ -\end{verbatim} - -\subsubsection{Using regular expressions or SQL LIKE -patterns}\label{sec-filter-regex-pattern} - -In Spark, you can also use a particular ``SQL LIKE pattern'' or a -regular pattern (a.k.a. regex) to filter the rows of a DataFrame, by -using the \texttt{Column} methods \texttt{like()} and \texttt{rlike()}. - -In essence, the \texttt{like()} method is the \texttt{pyspark} -equivalent of the \texttt{LIKE} SQL operator. As a result, this -\texttt{like()} method expects a SQL pattern as input. This means that -you can use the SQL metacharacters \texttt{\%} (to match any number of -characters) and \texttt{\_} (to match exactly one character) inside this -pattern. 
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{ .where(col(}\StringTok{\textquotesingle{}transferCurrency\textquotesingle{}}\NormalTok{).like(}\StringTok{\textquotesingle{}british\%\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------+-------------------+------------+-------------+
-|dateTransfer|   datetimeTransfer|clientNumber|transferValue|
-+------------+-------------------+------------+-------------+
-|  2022-12-30|2022-12-30 11:30:23|        1455|       5141.0|
-|  2022-12-30|2022-12-30 02:35:23|        5986|       6076.0|
-|  2022-12-29|2022-12-29 15:24:04|        4862|       5952.0|
-|  2022-12-29|2022-12-29 14:16:46|        2197|       8771.0|
-|  2022-12-29|2022-12-29 06:51:24|        5987|       2345.0|
-+------------+-------------------+------------+-------------+
-only showing top 5 rows
-... with 6 more columns: transferCurrency, transferID, trans
-ferLog, destinationBankNumber, destinationBankBranch, destin
-ationBankAccount
-\end{verbatim}
-
-Although the style of pattern matching used by \texttt{like()} is very
-powerful, you might need even more flexible patterns. In those cases,
-you can use the \texttt{rlike()} method, which accepts a regular
-expression as input. In the example below, I am filtering rows where
-\texttt{destinationBankAccount} starts with the characters \texttt{54}.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{regex }\OperatorTok{=} \StringTok{\textquotesingle{}\^{}54([0{-}9]}\SpecialCharTok{\{3\}}\StringTok{){-}[0{-}9]$\textquotesingle{}}
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{ .where(col(}\StringTok{\textquotesingle{}destinationBankAccount\textquotesingle{}}\NormalTok{).rlike(regex))}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------+-------------------+------------+-------------+
-|dateTransfer|   datetimeTransfer|clientNumber|transferValue|
-+------------+-------------------+------------+-------------+
-|  2022-12-29|2022-12-29 02:54:23|        2197|       5752.0|
-|  2022-12-27|2022-12-27 04:51:45|        4862|      11379.0|
-|  2022-12-05|2022-12-05 05:50:27|        4965|       5986.0|
-|  2022-12-04|2022-12-04 14:31:42|        4965|       8123.0|
-|  2022-11-29|2022-11-29 16:23:07|        4862|       8060.0|
-+------------+-------------------+------------+-------------+
-only showing top 5 rows
-... with 6 more columns: transferCurrency, transferID, trans
-ferLog, destinationBankNumber, destinationBankBranch, destin
-ationBankAccount
-\end{verbatim}
-
-\section{Selecting a subset of rows from your
-DataFrame}\label{selecting-a-subset-of-rows-from-your-dataframe}
-
-At some point, you might need to use just a small piece of your
-DataFrame over the next steps of your pipeline, and not the entire
-thing. For example, you may want to select just the first (or last) 5
-rows of this DataFrame, or, maybe, you need to take a random sample of
-rows from it.
-
-In this section I will discuss the main methods offered by Spark to deal
-with these scenarios. Each method returns a subset of rows from the
-original DataFrame as a result. But each method works differently from
-the others, and uses a different strategy to retrieve this subset.
-
-\subsection{Limiting the number of rows in your
-DataFrame}\label{limiting-the-number-of-rows-in-your-dataframe}
-
-The \texttt{limit()} method is very similar to the \texttt{LIMIT} SQL
-keyword.
It limits the number of rows present in your DataFrame to a
-specific amount. So, if I run \texttt{transf.limit(1)} I get a new
-DataFrame as a result, which has only a single row from the
-\texttt{transf} DataFrame. As you can see below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{single\_transfer }\OperatorTok{=}\NormalTok{ transf.limit(}\DecValTok{1}\NormalTok{)}
-\NormalTok{single\_transfer.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------+-------------------+------------+-------------+
-|dateTransfer|   datetimeTransfer|clientNumber|transferValue|
-+------------+-------------------+------------+-------------+
-|  2022-12-31|2022-12-31 14:00:24|        5516|      7794.31|
-+------------+-------------------+------------+-------------+
-... with 6 more columns: transferCurrency, transferID, trans
-ferLog, destinationBankNumber, destinationBankBranch, destin
-ationBankAccount
-\end{verbatim}
-
-It is worth mentioning that the \texttt{limit()} method will always try
-to limit your original DataFrame to the first \(n\) rows. This means
-that the command \texttt{df.limit(430)} tries to limit the \texttt{df}
-DataFrame to its first 430 rows.
-
-This also means that 430 is the maximum number of rows that will be
-taken from the \texttt{df} DataFrame. So, if the \texttt{df} DataFrame
-has fewer than 430 rows, say 14 rows, then nothing will happen, i.e.~the
-result of \texttt{df.limit(430)} will be equivalent to the \texttt{df}
-DataFrame itself.
-
-\subsection{\texorpdfstring{Getting the first/last \(n\) rows of your
-DataFrame}{Getting the first/last n rows of your DataFrame}}\label{getting-the-firstlast-n-rows-of-your-dataframe}
-
-The methods \texttt{head()} and \texttt{tail()} allow you to collect
-the first/last \(n\) rows of your DataFrame, respectively. One key
-aspect of these methods is that they return a list of \texttt{Row}
-values, instead of a new DataFrame (unlike the \texttt{limit()}
-method). You can compare these methods to the \texttt{take()} and
-\texttt{collect()} methods that we introduced in
-Section~\ref{sec-dataframe-actions}, because they both produce a list of
-\texttt{Row} values as well.
-
-Now, the \texttt{head()} method produces the same output as the
-\texttt{take()} method. However, these two methods work very differently
-under the hood, and are recommended for different scenarios.
-
-More specifically, if you have a big DataFrame (i.e.~a DataFrame with
-many rows), it is recommended to use \texttt{take()} (instead of
-\texttt{head()}) to collect the first \(n\) rows from it, because the
-\texttt{head()} method makes Spark load the entire DataFrame into the
-driver's memory, and this can easily cause an ``out of memory''
-situation for big DataFrames. So, use the \texttt{head()} method only
-for small DataFrames.
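-
-Just as a rough sketch of this advice (the \texttt{transf} DataFrame
-used here is small, so this is for illustration only), collecting the
-first 2 rows with \texttt{take()} looks like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-# take() also returns a list of Row values, but it only brings
-# the requested rows back to the driver:
-first2_rows = transf.take(2)
-print(first2_rows[0])
-\end{Highlighting}
-\end{Shaded}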
- -In the example below, we are using these methods to get the first and -last 2 rows of the \texttt{transf} DataFrame: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# First 2 rows of \textasciigrave{}transf\textasciigrave{} DataFrame:} -\NormalTok{first2 }\OperatorTok{=}\NormalTok{ transf.head(}\DecValTok{2}\NormalTok{)} -\CommentTok{\# Last 2 rows of \textasciigrave{}transf\textasciigrave{} DataFrame:} -\NormalTok{last2 }\OperatorTok{=}\NormalTok{ transf.tail(}\DecValTok{2}\NormalTok{)} - -\BuiltInTok{print}\NormalTok{(last2)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -[Row(dateTransfer=datetime.date(2022, 1, 1), - datetimeTransfer=datetime.datetime(2022, 1, 1, 4, 7, 44), - clientNumber=5987, transferValue=8640.06, - transferCurrency='dollar $', transferID=20221144, - transferLog=None, destinationBankNumber=666, - destinationBankBranch=6552, - destinationBankAccount='70021-4'), - Row(dateTransfer=datetime.date(2022, 1, 1), - datetimeTransfer=datetime.datetime(2022, 1, 1, 3, 56, 58), - clientNumber=6032, transferValue=5076.61, - transferCurrency='dollar $', transferID=20221143, - transferLog=None, destinationBankNumber=33, - destinationBankBranch=8800, - destinationBankAccount='41326-5')] -\end{verbatim} - -\subsection{Taking a random sample of your -DataFrame}\label{taking-a-random-sample-of-your-dataframe} - -With the \texttt{sample()} you can take a random sample of rows from -your DataFrame. In other words, this method returns a new DataFrame with -a random subset of rows from the original DataFrame. - -This method have three main arguments, which are: - -\begin{itemize} -\tightlist -\item - \texttt{withReplacement}: a boolean value indicating if the samples - are with replacement or not. Defaults to \texttt{False}; -\item - \texttt{fraction}: the fraction of rows you want to sample from the - DataFrame. Have to be a positive float value, from 0 to 1; -\item - \texttt{seed}: an integer representing the seed for the sampling - process. This is an optional argument; -\end{itemize} - -In the example below, we are trying to get a sample that represents 15\% -of the original \texttt{transf} DataFrame, and using the integer 24 as -our sampling seed: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf\_sample }\OperatorTok{=}\NormalTok{ transf.sample(fraction }\OperatorTok{=} \FloatTok{0.15}\NormalTok{, seed }\OperatorTok{=} \DecValTok{24}\NormalTok{)} -\NormalTok{transf\_sample.show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 01:02:06| 4862| 6714.0| -| 2022-12-30|2022-12-30 00:18:25| 5832| 6333.0| -| 2022-12-29|2022-12-29 06:51:24| 5987| 2345.0| -| 2022-12-27|2022-12-27 14:08:01| 3294| 6617.17| -| 2022-12-26|2022-12-26 11:25:09| 5832| 8178.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -In other words, the \texttt{fraction} argument represents a fraction of -the total number of rows in the original DataFrame. 
Since the \texttt{transf} DataFrame has 2421 rows in total, by setting
-the \texttt{fraction} argument to 0.15, we are asking Spark to collect a
-sample from \texttt{transf} that has approximately
-\(0.15 \times 2421 \approx 363\) rows.
-
-If we calculate the number of rows in the \texttt{transf\_sample}
-DataFrame, we can see that it has a number of rows close to 363:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{transf\_sample.count()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-355
-\end{verbatim}
-
-Furthermore, the sampling seed is just a way to ask Spark to produce the
-same random sample of the original DataFrame. That is, the sampling seed
-makes the resulting sample fixed: you always get the same random sample
-when you run the \texttt{sample()} method.
-
-On the other hand, when you do not set the \texttt{seed} argument, Spark
-will likely produce a different random sample of the original DataFrame
-every time you run the \texttt{sample()} method.
-
-\section{Managing the columns of your
-DataFrame}\label{managing-the-columns-of-your-dataframe}
-
-Sometimes, you need to manage or transform the columns you have. For
-example, you might need to change the order of these columns, or to
-delete/rename some of them. To do this, you can use the
-\texttt{select()} and \texttt{drop()} methods of your DataFrame.
-
-The \texttt{select()} method works very similarly to the \texttt{SELECT}
-statement of SQL. You basically list all the columns you want to keep in
-your DataFrame, in the specific order you want.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{ .select(}
- \StringTok{\textquotesingle{}transferID\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{,}
- \StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}transferValue\textquotesingle{}}
-\NormalTok{ )}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+----------+-------------------+------------+-------------+
-|transferID|   datetimeTransfer|clientNumber|transferValue|
-+----------+-------------------+------------+-------------+
-|  20223563|2022-12-31 14:00:24|        5516|      7794.31|
-|  20223562|2022-12-31 10:32:07|        4965|       7919.0|
-|  20223561|2022-12-31 07:37:02|        4608|       5603.0|
-|  20223560|2022-12-31 07:35:05|        1121|      4365.22|
-|  20223559|2022-12-31 02:53:44|        1121|       4620.0|
-+----------+-------------------+------------+-------------+
-only showing top 5 rows
-\end{verbatim}
-
-\subsection{Renaming your columns}\label{renaming-your-columns}
-
-Notice in the example above that the column names can be given
-directly as strings to \texttt{select()}. This makes life pretty easy,
-but it does not give you extra options.
-
-For example, you might want to rename some of the columns, and to do
-this, you need to use the \texttt{alias()} method from the
-\texttt{Column} class. Since this is a method from the \texttt{Column}
-class, you need to use it after a \texttt{col()} or \texttt{column()}
-function, or after a column name using the dot operator.
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{,} -\NormalTok{ col(}\StringTok{\textquotesingle{}transferID\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}ID\_of\_transfer\textquotesingle{}}\NormalTok{),} -\NormalTok{ transf.clientNumber.alias(}\StringTok{\textquotesingle{}clientID\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------------------+--------------+--------+ -| datetimeTransfer|ID_of_transfer|clientID| -+-------------------+--------------+--------+ -|2022-12-31 14:00:24| 20223563| 5516| -|2022-12-31 10:32:07| 20223562| 4965| -|2022-12-31 07:37:02| 20223561| 4608| -|2022-12-31 07:35:05| 20223560| 1121| -|2022-12-31 02:53:44| 20223559| 1121| -+-------------------+--------------+--------+ -only showing top 5 rows -\end{verbatim} - -By using this \texttt{alias()} method, you can rename multiple columns -within a single \texttt{select()} call. However, you can use the -\texttt{withColumnRenamed()} method to rename just a single column of -your DataFrame. The first argument of this method, is the current name -of this column, and, the second argument, is the new name of this -column. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumnRenamed(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}clientID\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+--------+-------------+ -|dateTransfer| datetimeTransfer|clientID|transferValue| -+------------+-------------------+--------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+--------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, t -ransferLog, destinationBankNumber, destinationBankBranch -, destinationBankAccount -\end{verbatim} - -\subsection{Dropping unnecessary -columns}\label{dropping-unnecessary-columns} - -In some cases, your DataFrame just have too many columns and you just -want to eliminate a few of them. 
In a situation like this, you can list the columns you want to drop
-from your DataFrame, inside the \texttt{drop()} method, like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{ .drop(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-------------------+-------------+----------------+----------+
-|   datetimeTransfer|transferValue|transferCurrency|transferID|
-+-------------------+-------------+----------------+----------+
-|2022-12-31 14:00:24|      7794.31|          zing ƒ|  20223563|
-|2022-12-31 10:32:07|       7919.0|          zing ƒ|  20223562|
-|2022-12-31 07:37:02|       5603.0|        dollar $|  20223561|
-|2022-12-31 07:35:05|      4365.22|        dollar $|  20223560|
-|2022-12-31 02:53:44|       4620.0|        dollar $|  20223559|
-+-------------------+-------------+----------------+----------+
-only showing top 5 rows
-... with 4 more columns: transferLog, destinationBankNumber, d
-estinationBankBranch, destinationBankAccount
-\end{verbatim}
-
-\subsection{Casting columns to a different data
-type}\label{sec-cast-column-type}
-
-Spark tries to do its best when guessing the correct data type for the
-columns of your DataFrame. But, obviously, Spark can get it wrong, and
-you end up deciding on your own which data type to use for a specific
-column.
-
-To explicitly transform a column to a specific data type, you can use
-the \texttt{cast()} or \texttt{astype()} methods inside
-\texttt{select()}. The \texttt{astype()} method is just an alias for
-\texttt{cast()}. The \texttt{cast()} method is very similar to the
-\texttt{CAST()} function in SQL, and belongs to the \texttt{Column}
-class, so you should always use it after a column name with the dot
-operator, or after a \texttt{col()}/\texttt{column()} function:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{ .select(}
- \StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{,}
-\NormalTok{ col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).cast(}\StringTok{\textquotesingle{}long\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}value\_as\_integer\textquotesingle{}}\NormalTok{),}
-\NormalTok{ transf.transferValue.cast(}\StringTok{\textquotesingle{}boolean\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}value\_as\_boolean\textquotesingle{}}\NormalTok{)}
-\NormalTok{ )}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-------------+----------------+----------------+
-|transferValue|value_as_integer|value_as_boolean|
-+-------------+----------------+----------------+
-|      7794.31|            7794|            true|
-|       7919.0|            7919|            true|
-|       5603.0|            5603|            true|
-|      4365.22|            4365|            true|
-|       4620.0|            4620|            true|
-+-------------+----------------+----------------+
-only showing top 5 rows
-\end{verbatim}
-
-To use the \texttt{cast()} or \texttt{astype()} methods, you give the
-name of the data type (as a string) to which you want to cast the
-column.
The -main available data types to \texttt{cast()} or \texttt{astype()} are: - -\begin{itemize} -\tightlist -\item - \texttt{\textquotesingle{}string\textquotesingle{}}: correspond to - \texttt{StringType()}; -\item - \texttt{\textquotesingle{}int\textquotesingle{}}: correspond to - \texttt{IntegerType()}; -\item - \texttt{\textquotesingle{}long\textquotesingle{}}: correspond to - \texttt{LongType()}; -\item - \texttt{\textquotesingle{}double\textquotesingle{}}: correspond to - \texttt{DoubleType()}; -\item - \texttt{\textquotesingle{}date\textquotesingle{}}: correspond to - \texttt{DateType()}; -\item - \texttt{\textquotesingle{}timestamp\textquotesingle{}}: correspond to - \texttt{TimestampType()}; -\item - \texttt{\textquotesingle{}boolean\textquotesingle{}}: correspond to - \texttt{BooleanType()}; -\item - \texttt{\textquotesingle{}array\textquotesingle{}}: correspond to - \texttt{ArrayType()}; -\item - \texttt{\textquotesingle{}dict\textquotesingle{}}: correspond to - \texttt{MapType()}; -\end{itemize} - -\subsection{\texorpdfstring{You can add new columns with -\texttt{select()}}{You can add new columns with select()}}\label{you-can-add-new-columns-with-select} - -When I said that \texttt{select()} works in the same way as the -\texttt{SELECT} statement of SQL, I also meant that you can use -\texttt{select()} to select columns that do not currently exist in your -DataFrame, and add them to the final result. - -For example, I can select a new column (called \texttt{by\_1000}) -containing \texttt{value} divided by 1000, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{,} -\NormalTok{ (col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{) }\OperatorTok{/} \DecValTok{1000}\NormalTok{).alias(}\StringTok{\textquotesingle{}by\_1000\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------------+-------+ -|transferValue|by_1000| -+-------------+-------+ -| 7794.31|7.79431| -| 7919.0| 7.919| -| 5603.0| 5.603| -| 4365.22|4.36522| -| 4620.0| 4.62| -+-------------+-------+ -only showing top 5 rows -\end{verbatim} - -This \texttt{by\_1000} column do not exist in \texttt{transf} DataFrame. -It was calculated and added to the final result by \texttt{select()}. -The formula -\texttt{col(\textquotesingle{}transferValue\textquotesingle{})\ /\ 1000} -is the equation that defines what this \texttt{by\_1000} column is, or, -how it should be calculated. - -Besides that, \texttt{select()} provides a useful shortcut to reference -all the columns of your DataFrame. That is, the star symbol (\texttt{*}) -from the \texttt{SELECT} statement in SQL. This shortcut is very useful -when you want to maintain all columns, and, add a new column, at the -same time. - -In the example below, we are adding the same \texttt{by\_1000} column, -however, we are bringing all the columns of \texttt{transf} together. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}*\textquotesingle{}}\NormalTok{,} -\NormalTok{ (col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{) }\OperatorTok{/} \DecValTok{1000}\NormalTok{).alias(}\StringTok{\textquotesingle{}by\_1000\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 7 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount, by_1000 -\end{verbatim} - -\section{Calculating or adding new columns to your -DataFrame}\label{calculating-or-adding-new-columns-to-your-dataframe} - -Although you can add new columns with \texttt{select()}, this method is -not specialized to do that. As consequence, when you want to add many -new columns, it can become pretty annoying to write -\texttt{select(\textquotesingle{}*\textquotesingle{},\ new\_column)} -over and over again. That is why \texttt{pyspark} provides a special -method called \texttt{withColumn()}. - -This method has two arguments. First, is the name of the new column. -Second, is the formula (or the equation) that represents this new -column. As an example, I could reproduce the same \texttt{by\_1000} -column like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}by\_1000\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{) }\OperatorTok{/} \DecValTok{1000}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 7 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount, by_1000 -\end{verbatim} - -A lot of the times we use the functions from -\texttt{pyspark.sql.functions} module to produce such formulas used by -\texttt{withColumn()}. You can checkout the complete list of functions -present in this module, by visiting the official documentation of -\texttt{pyspark}\footnote{\url{https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html\#functions}}. 
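-
-Just to give a taste of this module, the short sketch below adds two new
-columns to \texttt{transf} using the \texttt{year()} and \texttt{round()}
-functions (the new column names \texttt{transferYear} and
-\texttt{valueRounded} are made up for this illustration):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-import pyspark.sql.functions as F
-
-# Extract the year of each transfer and round the transferred
-# value to one decimal place:
-transf_extra = (
-    transf
-    .withColumn('transferYear', F.year('dateTransfer'))
-    .withColumn('valueRounded', F.round('transferValue', 1))
-)
-\end{Highlighting}
-\end{Shaded}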
- -You will see a lot more examples of formulas and uses of -\texttt{withColumn()} throughout this book. For now, I just want you to -know that \texttt{withColumn()} is a method that adds a new column to -your DataFrame. The first argument is the name of the new column, and, -the second argument is the calculation formula of this new column. - -\section{Sorting rows of your -DataFrame}\label{sorting-rows-of-your-dataframe} - -Spark, or, \texttt{pyspark}, provides the \texttt{orderBy()} and -\texttt{sort()} DataFrame method to sort rows. They both work the same -way: you just give the name of the columns that Spark will use to sort -the rows. - -In the example below, Spark will sort \texttt{transf} according to the -values in the \texttt{transferValue} column. By default, Spark uses an -ascending order while sorting your rows. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-07-22|2022-07-22 16:06:25| 3795| 60.0| -| 2022-05-09|2022-05-09 14:02:15| 3284| 104.0| -| 2022-09-16|2022-09-16 20:35:40| 3294| 129.09| -| 2022-12-18|2022-12-18 08:45:30| 1297| 142.66| -| 2022-08-20|2022-08-20 09:27:55| 2727| 160.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -Just to be clear, you can use the combination between multiple columns -to sort your rows. Just give the name of each column (as strings) -separated by commas. In the example below, Spark will sort the rows -according to \texttt{clientNumber} column first, then, is going to sort -the rows of each \texttt{clientNumber} according to -\texttt{transferValue} column. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-03-30|2022-03-30 11:57:22| 1121| 461.0| -| 2022-05-23|2022-05-23 11:51:02| 1121| 844.66| -| 2022-08-24|2022-08-24 13:51:30| 1121| 1046.93| -| 2022-09-23|2022-09-23 19:49:19| 1121| 1327.0| -| 2022-06-25|2022-06-25 17:07:08| 1121| 1421.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -If you want to use a descending order in a specific column, you need to -use the \texttt{desc()} method from \texttt{Column} class. In the -example below, Spark will sort the rows according to -\texttt{clientNumber} column using an ascending order. 
However, it will -use the values from \texttt{transferValue} column in a descending order -to sort the rows in each \texttt{clientNumber}. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).desc())}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-08-18|2022-08-18 13:57:12| 1121| 11490.37| -| 2022-11-05|2022-11-05 08:00:37| 1121| 10649.59| -| 2022-05-17|2022-05-17 10:27:05| 1121| 10471.23| -| 2022-05-15|2022-05-15 00:25:49| 1121| 10356.0| -| 2022-06-10|2022-06-09 23:51:39| 1121| 10142.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -This means that you can mix ascending orders with descending orders in -\texttt{orderBy()}. Since the ascending order is the default, if you -want to use a descending order in all of the columns, then, you need to -apply the \texttt{desc()} method to all of them. - -\section{Calculating aggregates}\label{calculating-aggregates} - -To calculate aggregates of a Spark DataFrame we have two main paths: 1) -we can use some standard DataFrame methods, like \texttt{count()} or -\texttt{sum()}, to calculate a single aggregate; 2) or, we can use the -\texttt{agg()} method to calculate multiple aggregates at the same time. - -\subsection{Using standard DataFrame -methods}\label{using-standard-dataframe-methods} - -The Spark DataFrame class by itself provides a single aggregate method, -which is \texttt{count()}. With this method, you can find out how many -rows your DataFrame have. In the example below, we can see that -\texttt{transf} have 2421 rows. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.count()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -2421 -\end{verbatim} - -However, if you have a \textbf{grouped} DataFrame (we will learn more -about these objects very soon), \texttt{pyspark} provides some more -aggregate methods, which are listed below: - -\begin{itemize} -\tightlist -\item - \texttt{mean()}: calculate the average value of each numeric column; -\item - \texttt{sum()}: return the total sum of a column; -\item - \texttt{count()}: count the number of rows; -\item - \texttt{max()}: compute the maximum value of a column; -\item - \texttt{min()}: compute the minimum value of a column; -\end{itemize} - -This means that you can use any of the above methods after a -\texttt{groupby()} call, to calculate aggregates \emph{per group} in -your Spark DataFrame. For now, lets forget about this ``groupby'' -detail, and learn how to calculate different aggregates by using the -\texttt{agg()} method. - -\subsection{\texorpdfstring{Using the \texttt{agg()} -method}{Using the agg() method}}\label{sec-agg-method} - -With the \texttt{agg()} method, we can calculate many different -aggregates at the same time. In this method, you should provide a -expression that describes what aggregate measure you want to calculate. 
-
-In most cases, this ``aggregate expression'' will be composed of
-functions from the \texttt{pyspark.sql.functions} module. So, having
-familiarity with the functions present in this module will help you
-compose the formulas of your aggregations in \texttt{agg()}.
-
-In the example below, I am using the \texttt{sum()} and \texttt{mean()}
-functions from \texttt{pyspark.sql.functions} to calculate the total sum
-and the mean of the \texttt{transferValue} column in the
-\texttt{transf} DataFrame. I am also using the \texttt{countDistinct()}
-function to calculate the number of distinct values in the
-\texttt{clientNumber} column.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{import}\NormalTok{ pyspark.sql.functions }\ImportTok{as}\NormalTok{ F}
-
-\NormalTok{transf.agg(}
-\NormalTok{ F.}\BuiltInTok{sum}\NormalTok{(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}total\_value\textquotesingle{}}\NormalTok{),}
-\NormalTok{ F.mean(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}mean\_value\textquotesingle{}}\NormalTok{),}
-\NormalTok{ F.countDistinct(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}number\_of\_clients\textquotesingle{}}\NormalTok{)}
-\NormalTok{ )}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+--------------------+-----------------+-----------------+
-|         total_value|       mean_value|number_of_clients|
-+--------------------+-----------------+-----------------+
-|1.5217690679999998E7|6285.704535315985|               26|
-+--------------------+-----------------+-----------------+
-\end{verbatim}
-
-\subsection{Without groups, we calculate an aggregate of the entire
-DataFrame}\label{without-groups-we-calculate-a-aggregate-of-the-entire-dataframe}
-
-When we do not define any group for the input DataFrame, \texttt{agg()}
-always produces a new DataFrame with a single row (like in the above
-example). This happens because we are calculating aggregates of the
-entire DataFrame, that is, a set of single values (or single measures)
-that summarizes (in some way) the entire DataFrame.
-
-On the other hand, when we define groups in a DataFrame (by using the
-\texttt{groupby()} method), the calculations performed by \texttt{agg()}
-are made inside each group in the DataFrame. In other words, instead of
-summarizing the entire DataFrame, \texttt{agg()} will produce a set of
-single values that describes (or summarizes) each group in the
-DataFrame.
-
-This means that each row in the resulting DataFrame describes a specific
-group in the original DataFrame, and \texttt{agg()} usually produces a
-DataFrame with more than one row when its calculations are performed by
-group, because our DataFrames usually have more than one group.
-
-\subsection{Calculating aggregates per group in your
-DataFrame}\label{sec-group-by}
-
-But how do you define the groups inside your DataFrame? To do this, we
-use the \texttt{groupby()} and \texttt{groupBy()} methods. These methods
-are synonymous (they do the same thing).
-
-These methods produce a \textbf{grouped} DataFrame as a result, or, in
-more technical words, an object of class
-\texttt{pyspark.sql.group.GroupedData}. You just need to provide, inside
-this \texttt{groupby()} method, the name of the columns that define (or
-``mark'') your groups.
-
-In the example below, I am creating a grouped DataFrame per client
-defined in the \texttt{clientNumber} column. This means that each
-distinct value in the \texttt{clientNumber} column defines a different
-group in the DataFrame.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{transf\_per\_client }\OperatorTok{=}\NormalTok{ transf.groupBy(}\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{)}
-\NormalTok{transf\_per\_client}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-GroupedData[grouping expressions: [clientNumber], value: [dateTra
-nsfer: date, datetimeTransfer: timestamp ... 8 more fields], type
-: GroupBy]
-\end{verbatim}
-
-At first, it appears that nothing has changed. But the
-\texttt{groupBy()} method always returns a new object of class
-\texttt{pyspark.sql.group.GroupedData}. As a consequence, we can no
-longer use some of the DataFrame methods that we used before, like the
-\texttt{show()} method to see the DataFrame.
-
-That's ok, because we usually do not want to keep this grouped DataFrame
-for long. This grouped DataFrame is just a passage (or a bridge) to
-the result we want, which is to calculate aggregates \textbf{per group}
-of the DataFrame.
-
-As an example, I can use the \texttt{max()} method to find out the
-highest value that each client has transferred, like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{transf\_per\_client}\OperatorTok{\textbackslash{}}
-\NormalTok{ .}\BuiltInTok{max}\NormalTok{(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------+------------------+
-|clientNumber|max(transferValue)|
-+------------+------------------+
-|        1217|           12601.0|
-|        2489|          12644.56|
-|        3284|          12531.84|
-|        4608|          10968.31|
-|        1297|           11761.0|
-+------------+------------------+
-only showing top 5 rows
-\end{verbatim}
-
-Remember that, by using standard DataFrame methods (like \texttt{max()}
-in the example above), we can calculate only a single aggregate value.
-But, with \texttt{agg()} we can calculate more than one aggregate value
-at the same time. Since our \texttt{transf\_per\_client} object is a
-\textbf{grouped} DataFrame, \texttt{agg()} will calculate these
-aggregates per group.
-
-As an example, if I apply \texttt{agg()} with the exact same expressions
-shown in Section~\ref{sec-agg-method} to the
-\texttt{transf\_per\_client} DataFrame, instead of a DataFrame with one
-single row, I get a new DataFrame with one row per client. In each row,
-I have the total and mean values for a specific client in the input
-DataFrame.
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf\_per\_client}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(} -\NormalTok{ F.}\BuiltInTok{sum}\NormalTok{(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}total\_value\textquotesingle{}}\NormalTok{),} -\NormalTok{ F.mean(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}mean\_value\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-----------------+------------------+ -|clientNumber| total_value| mean_value| -+------------+-----------------+------------------+ -| 1217|575218.2099999998| 6185.142043010751| -| 2489|546543.0900000001| 6355.152209302327| -| 3284|581192.5700000001| 6054.089270833334| -| 4608| 448784.44| 6233.117222222222| -| 1297|594869.6699999999|6196.5590624999995| -+------------+-----------------+------------------+ -only showing top 5 rows -\end{verbatim} - -\bookmarksetup{startatroot} - -\chapter{Importing data to Spark}\label{sec-import} - -Another way of creating Spark DataFrames, is to read (or import) data -from somewhere and convert it to a Spark DataFrame. Spark can read a -variety of file formats, including CSV, Parquet, JSON, ORC and Binary -files. Furthermore, Spark can connect to other databases and import -tables from them, using ODBC/JDBC connections. - -To read (or import) any data to Spark, we use a ``read engine'', and -there are many different read engines available in Spark. Each engine is -used to read a specific file format, or to import data from a specific -type of data source, and we access these engines by using the -\texttt{read} module from your Spark Session object. - -\section{Reading data from static files}\label{sec-read-files} - -Static files are probably the easiest way to transport data from one -computer to another. Because you just need to copy and paste this file -to the other machine, or download it from the internet. - -But in order to Spark read any type of static file stored inside your -computer, \textbf{it always need to know the path to this file}. Every -OS have its own file system, and every file in your computer is stored -in a specific address of this file system. The ``path'' to this file is -the path (or ``steps'') that your computer needs to follow to reach this -specific address, where the file is stored. - -As we pointed out earlier, to read any static file in Spark, you use one -of the available ``read engines'', which are in the \texttt{spark.read} -module of your Spark Session. This means that, each read engine in this -module is responsible for reading a specific file format. - -If you want to read a CSV file for example, you use the -\texttt{spark.read.csv()} engine. In contrast, if you want to read a -JSON file, you use the \texttt{spark.read.json()} engine instead. But no -matter what read engine you use, you always give the path to your file -to any of these ``read engines''. - -The main read engines available in Spark are: - -\begin{itemize} -\tightlist -\item - \texttt{spark.read.json()}: to read JSON files. -\item - \texttt{spark.read.csv()}: to read CSV files. -\item - \texttt{spark.read.parquet()}: to read Apache Parquet files. -\item - \texttt{spark.read.orc()}: to read ORC (Apache \emph{Optimized Row - Columnar} format) files. -\item - \texttt{spark.read.text()}: to read text files. 
-\item - \texttt{spark.read.jdbc()}: to read data from databases using the JDBC - API. -\end{itemize} - -For example, to read a JSON file called \texttt{sales.json} that is -stored in my \texttt{Data} folder, I can do this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{json\_data }\OperatorTok{=}\NormalTok{ spark.read.json(}\StringTok{"../Data/sales.json"}\NormalTok{)} -\NormalTok{json\_data.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----+----------+------------+-------+-------------------+-----+ -|price|product_id|product_name|sale_id| timestamp|units| -+-----+----------+------------+-------+-------------------+-----+ -| 3.12| 134| Milk 1L Mua| 328711|2022-02-01T22:10:02| 1| -| 1.22| 110| Coke 350ml| 328712|2022-02-03T11:42:09| 3| -| 4.65| 117| Pepsi 2L| 328713|2022-02-03T14:22:15| 1| -| 1.22| 110| Coke 350ml| 328714|2022-02-03T18:33:08| 1| -| 0.85| 341|Trident Mint| 328715|2022-02-04T15:41:36| 1| -+-----+----------+------------+-------+-------------------+-----+ -\end{verbatim} - -\section{An example with a CSV file}\label{an-example-with-a-csv-file} - -As an example, I have the following CSV file saved in my computer: - -\begin{verbatim} -name,age,job -Jorge,30,Developer -Bob,32,Developer -\end{verbatim} - -This CSV was saved in a file called \texttt{people.csv}, inside a folder -called \texttt{Data}. So, to read this static file, Spark needs to know -the path to this \texttt{people.csv} file. In other words, Spark needs -to know where this file is stored in my computer, to be able to read it. - -In my specific case, considering where this \texttt{Data} folder is in -my computer, a relative path to it would be \texttt{"../Data/"}. Having -the path to the folder where \texttt{people.csv} is stored, I just need -to add this file to the path, resulting in -\texttt{"../Data/people.csv"}. See in the example below, that I gave -this path to the \texttt{read.csv()} method of my Spark Session. As a -result, Spark will visit this address, and, read the file that is stored -there: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ SparkSession} -\NormalTok{spark }\OperatorTok{=}\NormalTok{ SparkSession.builder.getOrCreate()} - -\NormalTok{path }\OperatorTok{=} \StringTok{"../Data/people.csv"} - -\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.read.csv(path)} -\NormalTok{df.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----+---+---------+ -| _c0|_c1| _c2| -+-----+---+---------+ -| name|age| job| -|Jorge| 30|Developer| -| Bob| 32|Developer| -+-----+---+---------+ -\end{verbatim} - -In the above example, I gave a relative path to the file I wanted to -read. But you can provide an absolute path\footnote{That is, the - complete path to the file, or, in other words, a path that starts in - the root folder of your hard drive.} to the file, if you want to. The -\texttt{people.csv} is located at a very specific folder in my Linux -computer, so, the absolute path to this file is pretty long as you can -see below. But, if I were in my Windows machine, this absolute path -would be something like -\texttt{"C:\textbackslash{}Users\textbackslash{}pedro\textbackslash{}Documents\textbackslash{}Projects\textbackslash{}..."}. 
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# The absolute path to \textasciigrave{}people.csv\textasciigrave{}:}
-\NormalTok{path }\OperatorTok{=} \StringTok{"/home/pedro/Documents/Projects/Books/"}
-\NormalTok{path }\OperatorTok{=}\NormalTok{ path }\OperatorTok{+} \StringTok{"Introd{-}pyspark/Data/people.csv"}
-
-\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.read.csv(path)}
-\NormalTok{df.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-----+---+---------+
-|  _c0|_c1|      _c2|
-+-----+---+---------+
-| name|age|      job|
-|Jorge| 30|Developer|
-|  Bob| 32|Developer|
-+-----+---+---------+
-\end{verbatim}
-
-If you give an invalid path (that is, a path that does not exist in your
-computer), you will get an \texttt{AnalysisException}. In the example
-below, I try to read a file called \texttt{"weird-file.csv"} that (in
-theory) is located in my current working directory. But when Spark looks
-inside my current directory, it does not find any file called
-\texttt{"weird-file.csv"}. As a result, Spark raises an
-\texttt{AnalysisException} that warns me about this mistake.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.read.csv(}\StringTok{"weird{-}file.csv"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-Traceback (most recent call last):
-pyspark.sql.utils.AnalysisException:
-Path does not exist:
-file:/home/pedro/Documents/Projects/Books/Introd-pyspark/weird-fi
-le.csv
-\end{verbatim}
-
-Every time you face this ``Path does not exist'' error, it means that
-Spark did not find the file described in the path you gave to
-\texttt{spark.read}. In this case, it is very likely that you have a
-typo or a mistake in your path. Maybe you forgot to add the \texttt{.csv}
-extension to the name of your file. Or maybe you forgot to use the
-forward slash (\texttt{/}) instead of the backslash
-(\texttt{\textbackslash{}}). Or maybe you gave the path to folder
-\(x\), when in fact you wanted to reach folder \(y\).
-
-Sometimes, it is useful to list all the files that are stored inside the
-folder you are trying to access. This way, you can make sure that you
-are looking at the right folder of your file system. To do that, you can
-use the \texttt{listdir()} function from the \texttt{os} module of
-python. As an example, I can list all the files that are stored inside
-the \texttt{Data} folder in this way:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ os }\ImportTok{import}\NormalTok{ listdir}
-\NormalTok{listdir(}\StringTok{"../Data/"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-['accounts.csv',
- 'books.txt',
- 'livros.txt',
- 'logs.json',
- 'penguins.csv',
- 'people.csv',
- 'sales.json',
- 'transf.csv',
- 'transf_reform.csv',
- 'user-events.json']
-\end{verbatim}
-
-\section{Import options}\label{import-options}
-
-While reading and importing data from any type of data source, Spark
-will always use the default values for each import option defined by the
-read engine you are using, unless you explicitly ask it to use a
-different value. Each read engine has its own read/import options.
-
-For example, the \texttt{spark.read.orc()} engine has an option called
-\texttt{mergeSchema}. With this option, you can ask Spark to merge the
-schemas collected from all the ORC part-files. In contrast, the
-\texttt{spark.read.csv()} engine does not have such an option, because
-this functionality of ``merging schemas'' does not make sense for CSV
-files.
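-
-Just to make this concrete, here is a minimal sketch of how this option
-can be set (the folder \texttt{"../Data/some-orc-folder"} used below is
-hypothetical, purely for illustration):
-
-\begin{Shaded}
-\begin{Highlighting}[]
-# Ask Spark to merge the schemas of all ORC part-files found in
-# a (hypothetical) folder of ORC files:
-orc_data = (
-    spark.read
-    .option("mergeSchema", True)
-    .orc("../Data/some-orc-folder")
-)
-\end{Highlighting}
-\end{Shaded}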
-
-This means that some import options are specific (or characteristic) to
-certain file formats. For example, the \texttt{sep} option (where you
-define the \emph{separator} character) is used only in the
-\texttt{spark.read.csv()} engine, because you do not have a special
-character that behaves as the ``separator'' in the other file formats
-(like ORC, JSON, Parquet\ldots). So it does not make sense to have such
-an option in the other read engines.
-
-On the other hand, some import options co-exist in multiple read
-engines. For example, the \texttt{spark.read.json()} and
-\texttt{spark.read.csv()} engines both have an \texttt{encoding} option.
-The encoding is a very important piece of information, and Spark needs
-it to correctly interpret your file. By default, Spark will always
-assume that your files use the UTF-8 encoding system. However, this may
-not be true for your specific file, and in these cases you use the
-\texttt{encoding} option to tell Spark which encoding system to use.
-
-In the next sections, I will break down some of the most used import
-options for each file format. If you want to see the complete list of
-import options, you can visit the \emph{Data Source Option} section for
-the specific file format you are using in the Spark SQL
-Guide\footnote{For example, this \emph{Data Source Option} for Parquet
-  files is located at:
-  \url{https://spark.apache.org/docs/latest/sql-data-sources-parquet.html\#data-source-option}}.
-
-To define, or set, a specific import option, you use the
-\texttt{option()} method of a \texttt{DataFrameReader} object. To
-produce this kind of object, you use the \texttt{spark.read} module,
-like in the example below. Each call to the \texttt{option()} method is
-used to set a single import option.
-
-Notice that the ``read engine'' of Spark (i.e.~\texttt{csv()}) is the
-last method called in this chain (or sequence) of steps. In other words,
-you start by creating a \texttt{DataFrameReader} object, then you set
-the import options, and lastly, you define which ``read engine'' you
-want to use.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# Creating a \textasciigrave{}DataFrameReader\textasciigrave{} object:}
-\NormalTok{df\_reader }\OperatorTok{=}\NormalTok{ spark.read}
-\CommentTok{\# Setting the import options:}
-\NormalTok{df\_reader }\OperatorTok{=}\NormalTok{ df\_reader}\OperatorTok{\textbackslash{}}
-\NormalTok{ .option(}\StringTok{"sep"}\NormalTok{, }\StringTok{"$"}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{ .option(}\StringTok{"locale"}\NormalTok{, }\StringTok{"pt{-}BR"}\NormalTok{)}
-
-\CommentTok{\# Setting the "read engine" to be used with \textasciigrave{}.csv()\textasciigrave{}:}
-\NormalTok{my\_data }\OperatorTok{=}\NormalTok{ df\_reader}\OperatorTok{\textbackslash{}}
-\NormalTok{ .csv(}\StringTok{"../Data/a{-}csv{-}file.csv"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-If you prefer, you can also merge all these calls together like this:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{spark.read\textbackslash{} }\CommentTok{\# a \textasciigrave{}DataFrameReader\textasciigrave{} object}
-\NormalTok{ .option(}\StringTok{"sep"}\NormalTok{, }\StringTok{"$"}\NormalTok{)\textbackslash{} }\CommentTok{\# Setting the \textasciigrave{}sep\textasciigrave{} option}
-\NormalTok{ .option(}\StringTok{"locale"}\NormalTok{, }\StringTok{"pt{-}BR"}\NormalTok{)\textbackslash{} }\CommentTok{\# Setting the \textasciigrave{}locale\textasciigrave{} option}
-\NormalTok{ .csv(}\StringTok{"../Data/a{-}csv{-}file.csv"}\NormalTok{) }\CommentTok{\# The "read engine" to be used}
-\end{Highlighting}
-\end{Shaded}
-
-There are many different import options for each read engine, and you
-can see the full list in the official documentation for
-Spark\footnote{\url{https://spark.apache.org/docs/latest/sql-data-sources-csv.html}}.
-But let me give you a brief overview of probably the most popular
-import options:
-
-\begin{itemize}
-\tightlist
-\item
-  \texttt{sep}: sets the separator character for each field and value in
-  the CSV file (defaults to \texttt{","});
-\item
-  \texttt{encoding}: sets the character encoding of the file to be read
-  (defaults to \texttt{"UTF-8"});
-\item
-  \texttt{header}: boolean (defaults to \texttt{False}), should Spark
-  consider the first line of the file as the header of the DataFrame
-  (i.e.~the names of the columns)?
-\item
-  \texttt{dateFormat} and \texttt{timestampFormat}: set the format for
-  dates and timestamps in the file (defaults to \texttt{"yyyy-MM-dd"}
-  and
-  \texttt{"yyyy-MM-dd\textquotesingle{}T\textquotesingle{}HH:mm:ss{[}.SSS{]}{[}XXX{]}"}
-  respectively);
-\end{itemize}
-
-\section{Setting the separator character for CSV
-files}\label{setting-the-separator-character-for-csv-files}
-
-In this section, we will use the \texttt{transf\_reform.csv} file to
-demonstrate how to set the separator character of a CSV file. This file
-contains some data on transfers made in a fictitious bank. It is worth
-mentioning that the \texttt{sep} import option is only available for
-CSV files.
-
-Let's use the \texttt{peek\_file()} function defined below to get a
-quick peek at the first 5 lines of this file. If you look closely at
-these lines, you can see that this CSV file uses the \texttt{";"}
-character to separate fields and values, and not the American standard
-\texttt{","} character.
- -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{def}\NormalTok{ peek\_file(path, n\_lines }\OperatorTok{=} \DecValTok{5}\NormalTok{):} - \ControlFlowTok{with} \BuiltInTok{open}\NormalTok{(path) }\ImportTok{as} \BuiltInTok{file}\NormalTok{:} -\NormalTok{ lines }\OperatorTok{=}\NormalTok{ [}\BuiltInTok{next}\NormalTok{(}\BuiltInTok{file}\NormalTok{) }\ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(n\_lines)]} -\NormalTok{ text }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{.join(lines)} - \BuiltInTok{print}\NormalTok{(text)} - -\NormalTok{peek\_file(}\StringTok{"../Data/transf\_reform.csv"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -datetime;user;value;transferid;country;description -2018-12-06T22:19:19Z;Eduardo;598.5984;116241629;Germany; -2018-12-06T22:10:34Z;Júlio;4610.955;115586504;Germany; -2018-12-06T21:59:50Z;Nathália;4417.866;115079280;Germany; -2018-12-06T21:54:13Z;Júlio;2739.618;114972398;Germany; -\end{verbatim} - -This is usually the standard adopted by countries that uses a comma to -define decimal places in real numbers. In other words, in some -countries, the number \texttt{3.45} is usually written as \texttt{3,45}. - -Anyway, we know now that the \texttt{transf\_reform.csv} file uses a -different separator character, so, to correctly read this CSV file into -Spark, we need to set the \texttt{sep} import option. Since this file -comes with the column names in the first line, I also set the -\texttt{header} import option to read this first line as the column -names as well. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf }\OperatorTok{=}\NormalTok{ spark.read}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"sep"}\NormalTok{, }\StringTok{";"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"header"}\NormalTok{, }\VariableTok{True}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"../Data/transf\_reform.csv"}\NormalTok{)} - -\NormalTok{transf.show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------------+--------+--------+----------+-------+ -| datetime| user| value|transferid|country| -+--------------------+--------+--------+----------+-------+ -|2018-12-06T22:19:19Z| Eduardo|598.5984| 116241629|Germany| -|2018-12-06T22:10:34Z| Júlio|4610.955| 115586504|Germany| -|2018-12-06T21:59:50Z|Nathália|4417.866| 115079280|Germany| -|2018-12-06T21:54:13Z| Júlio|2739.618| 114972398|Germany| -|2018-12-06T21:41:27Z| Ana|1408.261| 116262934|Germany| -+--------------------+--------+--------+----------+-------+ -only showing top 5 rows -... with 1 more columns: description -\end{verbatim} - -\section{Setting the encoding of the -file}\label{setting-the-encoding-of-the-file} - -Spark will always assume that your static files use the UTF-8 encoding -system. But, that might not be the case for your specific file. In this -situation, you have to tell Spark which is the appropriate encoding -system to be used while reading the file. This \texttt{encoding} import -option is available both for CSV and JSON files. - -To do this, you can set the \texttt{encoding} import option, with the -name of the encoding system to be used. As an example, lets look at the -file \texttt{books.txt}, which is a CSV file encoded with the ISO-8859-1 -system (i.e.~the Latin 1 system). 
- -If we use the defaults in Spark, you can see in the result below that -some characters in the \texttt{Title} column are not correctly -interpreted. Remember, this problem occurs because of a mismatch in -encoding systems. Spark thinks \texttt{books.txt} is using the UTF-8 -system, but, in reality, it uses the ISO-8859-1 system: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{books }\OperatorTok{=}\NormalTok{ spark.read}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"header"}\NormalTok{, }\VariableTok{True}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"../Data/books.txt"}\NormalTok{)} - -\NormalTok{books.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------------+--------------------+------+ -| Title| Author| Price| -+--------------------+--------------------+------+ -| O Hobbit| J. R. R. Tolkien| 40.72| -|Matem�tica para E...|Carl P. Simon and...|139.74| -|Microeconomia: um...| Hal R. Varian| 141.2| -| A Luneta �mbar| Philip Pullman| 42.89| -+--------------------+--------------------+------+ -\end{verbatim} - -But if we tell Spark to use the ISO-8859-1 system while reading the -file, then, all problems are solved, and all characters in the file are -correctly interpreted, as you see in the result below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{books }\OperatorTok{=}\NormalTok{ spark.read}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"header"}\NormalTok{, }\VariableTok{True}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"encoding"}\NormalTok{, }\StringTok{"ISO{-}8859{-}1"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"../Data/books.txt"}\NormalTok{)} - -\NormalTok{books.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------------+--------------------+------+ -| Title| Author| Price| -+--------------------+--------------------+------+ -| O Hobbit| J. R. R. Tolkien| 40.72| -|Matemática para E...|Carl P. Simon and...|139.74| -|Microeconomia: um...| Hal R. Varian| 141.2| -| A Luneta Âmbar| Philip Pullman| 42.89| -+--------------------+--------------------+------+ -\end{verbatim} - -\section{Setting the format of dates and -timestamps}\label{setting-the-format-of-dates-and-timestamps} - -The format that humans write dates and timestamps vary drastically over -the world. By default, Spark will assume that the dates and timestamps -stored in your file are in the format described by the ISO-8601 -standard. That is, the ``YYYY-mm-dd'', or, ``year-month-day'' format. - -But this standard might not be the case for your file. For example: the -brazilian people usually write dates in the format ``dd/mm/YYYY'', or, -``day/month/year''; some parts of Spain write dates in the format -``YYYY/dd/mm'', or, ``year/day/month''; on Nordic countries -(i.e.~Sweden, Finland) dates are written in ``YYYY.mm.dd'' format. - -Every format of a date or timestamp is defined by using a string with -the codes of each part of the date/timestamp, like the letter `Y' which -represents a 4-digit year, or the letter `d' which represents a 2-digit -day. You can see the complete list of codes and their description in the -official documentation of Spark\footnote{\url{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}}. - -As an example, lets look into the \texttt{user-events.json} file. We can -see that the dates and timestamps in this file are using the -``dd/mm/YYYY'' and ``dd/mm/YYYY HH:mm:ss'' formats respectively. 
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{peek\_file(}\StringTok{"../Data/user{-}events.json"}\NormalTok{, n\_lines}\OperatorTok{=}\DecValTok{3}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-{"dateOfEvent":"15/06/2022","timeOfEvent":"15/06/2022 14:33:10","
-userId":"b902e51e-d043-4a66-afc4-a820173e1bb4","nameOfEvent":"ent
-ry"}
-{"dateOfEvent":"15/06/2022","timeOfEvent":"15/06/2022 14:40:08","
-userId":"b902e51e-d043-4a66-afc4-a820173e1bb4","nameOfEvent":"cli
-ck: shop"}
-{"dateOfEvent":"15/06/2022","timeOfEvent":"15/06/2022 15:48:41","
-userId":"b902e51e-d043-4a66-afc4-a820173e1bb4","nameOfEvent":"sel
-ect: payment-method"}
-\end{verbatim}
-
-Date variables are usually interpreted by Spark as string variables. In
-other words, Spark usually does not convert data that contains dates
-into its date (or timestamp) types automatically. For Spark to interpret
-these columns with the correct types, we need to provide an explicit
-schema (with \texttt{DateType} and \texttt{TimestampType} fields), and
-set the \texttt{dateFormat} and \texttt{timestampFormat} import options
-with the patterns used in the file, like in the example below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ StructType, StructField}
-\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ DateType, StringType, TimestampType}
-
-\NormalTok{schema }\OperatorTok{=}\NormalTok{ StructType([}
-\NormalTok{    StructField(}\StringTok{\textquotesingle{}dateOfEvent\textquotesingle{}}\NormalTok{, DateType(), }\VariableTok{True}\NormalTok{),}
-\NormalTok{    StructField(}\StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{, TimestampType(), }\VariableTok{True}\NormalTok{),}
-\NormalTok{    StructField(}\StringTok{\textquotesingle{}userId\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{True}\NormalTok{),}
-\NormalTok{    StructField(}\StringTok{\textquotesingle{}nameOfEvent\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{True}\NormalTok{)}
-\NormalTok{])}
-
-\NormalTok{user\_events }\OperatorTok{=}\NormalTok{ spark.read}\OperatorTok{\textbackslash{}}
-\NormalTok{    .option(}\StringTok{"dateFormat"}\NormalTok{, }\StringTok{"d/M/y"}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .option(}\StringTok{"timestampFormat"}\NormalTok{, }\StringTok{"d/M/y k:m:s"}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .json(}\StringTok{"../Data/user{-}events.json"}\NormalTok{, schema }\OperatorTok{=}\NormalTok{ schema)}
-
-\NormalTok{user\_events.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-----------+-------------------+--------------------+
-|dateOfEvent|        timeOfEvent|              userId|
-+-----------+-------------------+--------------------+
-| 2022-06-15|2022-06-15 14:33:10|b902e51e-d043-4a6...|
-| 2022-06-15|2022-06-15 14:40:08|b902e51e-d043-4a6...|
-| 2022-06-15|2022-06-15 15:48:41|b902e51e-d043-4a6...|
-+-----------+-------------------+--------------------+
-... with 1 more columns: nameOfEvent
-\end{verbatim}
-
-\bookmarksetup{startatroot}
-
-\chapter{\texorpdfstring{Working with SQL in
-\texttt{pyspark}}{Working with SQL in pyspark}}\label{sec-dataframe-sql-chapter}
-
-As we discussed in Chapter~\ref{sec-introd-spark}, Spark is a
-\textbf{multi-language} engine for large-scale data processing. This
-means that we can build our Spark application using many different
-languages (like Java, Scala, Python and R). Furthermore, you can also
-use the Spark SQL module of Spark to translate all of your
-transformations into pure SQL queries.
-
-In more detail, Spark SQL is a Spark module for structured data
-processing (\emph{Apache Spark Official Documentation} 2022).
Because -this module works with Spark DataFrames, using SQL, you can translate -all transformations that you build with the DataFrame API into a SQL -query. - -Therefore, you can mix python code with SQL queries very easily in -Spark. Virtually all transformations exposed in python throughout this -book, can be translated into a SQL query using this module of Spark. We -will focus a lot on this exchange between Python and SQL in this -chapter. - -However, this also means that the Spark SQL module does not handle the -transformations produced by the unstructured APIs of Spark, i.e.~the -Dataset API. Since the Dataset API is not available in \texttt{pyspark}, -it is not covered in this book. - -\section{\texorpdfstring{The \texttt{sql()} method as the main -entrypoint}{The sql() method as the main entrypoint}}\label{the-sql-method-as-the-main-entrypoint} - -The main entrypoint, that is, the main bridge that connects Spark SQL to -Python is the \texttt{sql()} method of your Spark Session. This method -accepts a SQL query inside a string as input, and will always output a -new Spark DataFrame as result. That is why I used the \texttt{show()} -method right after \texttt{sql()}, in the example below, to see what -this new Spark DataFrame looked like. - -As a first example, lets look at a very basic SQL query, that just -select a list of code values: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{SELECT} \OperatorTok{*} -\KeywordTok{FROM}\NormalTok{ (} - \KeywordTok{VALUES}\NormalTok{ (}\DecValTok{11}\NormalTok{), (}\DecValTok{31}\NormalTok{), (}\DecValTok{24}\NormalTok{), (}\DecValTok{35}\NormalTok{)} -\NormalTok{) }\KeywordTok{AS} \KeywordTok{List}\NormalTok{(Codes)} -\end{Highlighting} -\end{Shaded} - -To run the above SQL query, and see its results, I must write this query -into a string, and give this string to the \texttt{sql()} method of my -Spark Session. Then, I use the \texttt{show()} action to see the actual -result rows of data generated by this query: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{sql\_query }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{SELECT *} -\StringTok{FROM (} -\StringTok{ VALUES (11), (31), (24), (35)} -\StringTok{) AS List(Codes)} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{spark.sql(sql\_query).show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----+ -|Codes| -+-----+ -| 11| -| 31| -| 24| -| 35| -+-----+ -\end{verbatim} - -If you want to execute a very short SQL query, is fine to write it -inside a single pair of quotation marks (for example -\texttt{"SELECT\ *\ FROM\ sales.per\_day"}). However, since SQL queries -usually take multiple lines, you can write your SQL query inside a -python docstring (created by a pair of three quotation marks), like in -the example above. - -Having this in mind, every time you want to execute a SQL query, you can -use this \texttt{sql()} method from the object that holds your Spark -Session. So the \texttt{sql()} method is the bridge between -\texttt{pyspark} and SQL. You give it a pure SQL query inside a string, -and, Spark will execute it, considering your Spark SQL context. - -\subsection{A single SQL statement per -run}\label{a-single-sql-statement-per-run} - -Is worth pointing out that, although being the main bridge between -Python and SQL, the Spark Session \texttt{sql()} method can execute only -a single SQL statement per run. 
This means that if you try to execute -two sequential SQL statements at the same time with \texttt{sql()}, -then, Spark SQL will automatically raise a \texttt{ParseException} -error, which usually complains about an ``extra input''. - -In the example below, we are doing two very basic steps to SQL. We first -create a dummy database with a \texttt{CREATE\ DATABASE} statement, -then, we ask SQL to use this new database that we created as the default -database of the current session, with a \texttt{USE} statement. - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{CREATE} \KeywordTok{DATABASE}\NormalTok{ \textasciigrave{}dummy\textasciigrave{};} -\KeywordTok{USE}\NormalTok{ \textasciigrave{}dummy\textasciigrave{};} -\end{Highlighting} -\end{Shaded} - -If we try to execute these two steps at once, by using the -\texttt{sql()} method, Spark complains with a \texttt{ParseException}, -indicating that we have a sytax error in our query, like in the example -below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{query }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{CREATE DATABASE \textasciigrave{}dummy\textasciigrave{};} -\StringTok{USE \textasciigrave{}dummy\textasciigrave{};} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{spark.sql(query).show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -Traceback (most recent call last): - File "", line 1, in - File "/opt/spark/python/pyspark/sql/session.py", line 1034, in -sql - return DataFrame(self._jsparkSession.sql(sqlQuery), self) - File "/opt/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gat -eway.py", line 1321, in __call__ - File "/opt/spark/python/pyspark/sql/utils.py", line 196, in dec -o - raise converted from None -pyspark.sql.utils.ParseException: -Syntax error at or near 'USE': extra input 'USE'(line 3, pos 0) - -== SQL == - -CREATE DATABASE `dummy`; -USE `dummy`; -^^^ -\end{verbatim} - -However, there is nothing wrong about the above SQL statements. They are -both correct and valid SQL statements, both semantically and -syntactically. In other words, the case above results in a -\texttt{ParseException} error solely because it contains two different -SQL statements. - -In essence, the \texttt{spark.sql()} method always expect a single SQL -statement as input, and, therefore, it will try to parse this input -query as a single SQL statement. If it finds multiple SQL statements -inside this input string, the method will automatically raise the above -error. - -Now, be aware that some SQL queries can take multiple lines, but, -\textbf{still be considered a single SQL statement}. 
A query started by -a \texttt{WITH} clause is usually a good example of a SQL query that can -group multiple \texttt{SELECT} statements, but still be considered a -single SQL statement as a whole: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{{-}{-} The query below would execute} -\CommentTok{{-}{-} perfectly fine inside spark.sql():} -\KeywordTok{WITH}\NormalTok{ table1 }\KeywordTok{AS}\NormalTok{ (} - \KeywordTok{SELECT} \OperatorTok{*} - \KeywordTok{FROM}\NormalTok{ somewhere} -\NormalTok{),} - -\NormalTok{filtering }\KeywordTok{AS}\NormalTok{ (} - \KeywordTok{SELECT} \OperatorTok{*} - \KeywordTok{FROM}\NormalTok{ table1} - \KeywordTok{WHERE}\NormalTok{ dateOfTransaction }\OperatorTok{==} \FunctionTok{CAST}\NormalTok{(}\OtherTok{"2022{-}02{-}02"} \KeywordTok{AS} \DataTypeTok{DATE}\NormalTok{)} -\NormalTok{)} - -\KeywordTok{SELECT} \OperatorTok{*} -\KeywordTok{FROM}\NormalTok{ filtering} -\end{Highlighting} -\end{Shaded} - -Another example of a usually big and complex query, that can take -multiple lines but still be considered a single SQL statement, is a -single \texttt{SELECT} statement that selects multiple subqueries that -are nested together, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{{-}{-} The query below would also execute} -\CommentTok{{-}{-} perfectly fine inside spark.sql():} -\KeywordTok{SELECT} \OperatorTok{*} -\KeywordTok{FROM}\NormalTok{ (} - \CommentTok{{-}{-} First subquery.} - \KeywordTok{SELECT} \OperatorTok{*} - \KeywordTok{FROM}\NormalTok{ (} - \CommentTok{{-}{-} Second subquery..} - \KeywordTok{SELECT} \OperatorTok{*} - \KeywordTok{FROM}\NormalTok{ (} - \CommentTok{{-}{-} Third subquery...} - \KeywordTok{SELECT} \OperatorTok{*} - \KeywordTok{FROM}\NormalTok{ (} - \CommentTok{{-}{-} Ok this is enough....} -\NormalTok{ )} -\NormalTok{ )} -\NormalTok{ )} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -However, if we had multiple separate \texttt{SELECT} statements that -were independent on each other, like in the example below, then, -\texttt{spark.sql()} would issue an \texttt{ParseException} error if we -tried to execute these three \texttt{SELECT} statements inside the same -input string. 
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{{-}{-} These three statements CANNOT be executed}
-\CommentTok{{-}{-} at the same time inside spark.sql()}
-\KeywordTok{SELECT} \OperatorTok{*} \KeywordTok{FROM}\NormalTok{ something;}
-\KeywordTok{SELECT} \OperatorTok{*} \KeywordTok{FROM}\NormalTok{ somewhere;}
-\KeywordTok{SELECT} \OperatorTok{*} \KeywordTok{FROM}\NormalTok{ sometime;}
-\end{Highlighting}
-\end{Shaded}
-
-In conclusion, if you want to execute multiple statements, you can use a
-\texttt{for} loop that calls \texttt{spark.sql()} once for each
-individual SQL statement:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{statements }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}SELECT * FROM something;}
-\StringTok{SELECT * FROM somewhere;}
-\StringTok{SELECT * FROM sometime;\textquotesingle{}\textquotesingle{}\textquotesingle{}}
-
-\NormalTok{statements }\OperatorTok{=}\NormalTok{ statements.split(}\StringTok{\textquotesingle{}}\CharTok{\textbackslash{}n}\StringTok{\textquotesingle{}}\NormalTok{)}
-\ControlFlowTok{for}\NormalTok{ statement }\KeywordTok{in}\NormalTok{ statements:}
-\NormalTok{    spark.sql(statement)}
-\end{Highlighting}
-\end{Shaded}
-
-\section{Creating SQL Tables in
-Spark}\label{creating-sql-tables-in-spark}
-
-In real-life jobs in industry, it is very likely that your data will be
-stored inside a SQL-like database. Spark can connect to an external SQL
-database through JDBC/ODBC connections, or read tables from Apache Hive.
-This way, you can send your SQL queries to this external database.
-
-However, to keep the examples throughout this chapter simple, we will
-use \texttt{pyspark} to create a simple temporary SQL table in our Spark
-SQL context, and use this temporary SQL table in our examples of SQL
-queries. This way, we avoid the work of connecting to some existing SQL
-database, and still get to learn how to use SQL queries in
-\texttt{pyspark}.
-
-First, let's create our Spark Session. You can see below that I used the
-\texttt{config()} method to set a specific option of the session called
-\texttt{catalogImplementation} to the value \texttt{"hive"}. This option
-controls the implementation of the Spark SQL Catalog, which is a core
-part of the SQL functionality of Spark\footnote{There are some very
-  good materials explaining what the Spark SQL Catalog is and what its
-  purpose is. For a soft introduction, I recommend Sarfaraz Hussain's
-  post:
-  \url{https://medium.com/@sarfarazhussain211/metastore-in-apache-spark-9286097180a4}.
-  For a more technical introduction, see
-  \url{https://jaceklaskowski.gitbooks.io/mastering-spark-sql/content/spark-sql-Catalog.html}.}.
-
-Spark usually complains with an \texttt{AnalysisException} error when
-you try to create SQL tables with this option undefined (or not
-configured). So, if you decide to follow the examples of this chapter,
-please always set this option right at the start of your
-script\footnote{You can learn more about why this specific option is
-  necessary by looking at this StackOverflow post:
-  \url{https://stackoverflow.com/questions/50914102/why-do-i-get-a-hive-support-is-required-to-create-hive-table-as-select-error}.}.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ SparkSession}
-\NormalTok{spark }\OperatorTok{=}\NormalTok{ SparkSession}\OperatorTok{\textbackslash{}}
-\NormalTok{    .builder}\OperatorTok{\textbackslash{}}
-\NormalTok{    .config(}\StringTok{"spark.sql.catalogImplementation"}\NormalTok{,}\StringTok{"hive"}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .getOrCreate()}
-\end{Highlighting}
-\end{Shaded}
-
-\subsection{\texorpdfstring{\texttt{TABLEs} versus
-\texttt{VIEWs}}{TABLEs versus VIEWs}}\label{tables-versus-views}
-
-To run a complete SQL query over any Spark DataFrame, you must register
-this DataFrame in the Spark SQL Catalog of your Spark Session. You can
-register a Spark DataFrame into this catalog as a physical SQL
-\texttt{TABLE}, or as a SQL \texttt{VIEW}.
-
-If you are familiar with the SQL language and Relational DataBase
-Management Systems (RDBMS, such as MySQL), you have probably already
-heard of these two types (\texttt{TABLE} or \texttt{VIEW}) of SQL
-objects. But if not, we will explain each one in this section. It is
-worth pointing out that choosing between these two types \textbf{does
-not affect} your code, or your transformations, in any way. It just
-affects the way that Spark SQL stores the table/DataFrame itself.
-
-\subsubsection{\texorpdfstring{\texttt{VIEWs} are stored as SQL queries
-or memory
-pointers}{VIEWs are stored as SQL queries or memory pointers}}\label{views-are-stored-as-sql-queries-or-memory-pointers}
-
-When you register a DataFrame as a SQL \texttt{VIEW}, the query to
-produce this DataFrame is stored, not the DataFrame itself. There are
-also cases where Spark stores a memory pointer instead, which points to
-the memory address where this DataFrame is stored. In this case, Spark
-SQL uses this pointer every time it needs to access this DataFrame.
-
-Therefore, when you call (or access) this SQL \texttt{VIEW} inside your
-SQL queries (for example, with a \texttt{SELECT\ *\ FROM} statement),
-Spark SQL will automatically get this SQL \texttt{VIEW} ``on the fly''
-(that is, at runtime), either by executing the query necessary to build
-the initial DataFrame that you stored inside this \texttt{VIEW}, or, if
-this DataFrame is already stored in memory, by looking at the specific
-memory address where it is stored.
-
-In other words, when you create a SQL \texttt{VIEW}, Spark SQL does not
-store any physical data or rows of the DataFrame. It just stores the SQL
-query necessary to build your DataFrame. In some way, you can interpret
-any SQL \texttt{VIEW} as an abbreviation for a SQL query, or a
-``nickname'' for an already existing DataFrame.
-
-As a consequence, for most use cases, SQL \texttt{VIEWs} are easier to
-manage inside your data pipelines, because you usually do not have to
-update them. Since they are calculated from scratch at the moment you
-request them, a SQL \texttt{VIEW} will always reflect the most recent
-version of the data.
-
-This means that the concept of a \texttt{VIEW} in Spark SQL is very
-similar to the concept of a \texttt{VIEW} in other types of SQL
-databases, such as the MySQL database.
If you read the
-\href{https://dev.mysql.com/doc/refman/8.0/en/create-view.html}{official
-documentation for the \texttt{CREATE\ VIEW} statement of
-MySQL}\footnote{\url{https://dev.mysql.com/doc/refman/8.0/en/create-view.html}}
-you will get a similar idea of what a \texttt{VIEW} is:
-
-\begin{quote}
-The select\_statement is a SELECT statement that provides the definition
-of the view. (Selecting from the view selects, in effect, using the
-SELECT statement.) \ldots{}
-\end{quote}
-
-The excerpt above tells us that selecting from a \texttt{VIEW} causes
-the SQL engine to execute the expression defined in
-\texttt{select\_statement} using the \texttt{SELECT} statement. In other
-words, in MySQL, a SQL \texttt{VIEW} is basically an alias for an
-existing \texttt{SELECT} statement.
-
-\subsubsection{\texorpdfstring{Differences in Spark SQL
-\texttt{VIEW}s}{Differences in Spark SQL VIEWs}}\label{differences-in-spark-sql-views}
-
-Although a Spark SQL \texttt{VIEW} is very similar to other types of SQL
-\texttt{VIEW} (such as the MySQL type), in Spark applications, SQL
-\texttt{VIEW}s are usually registered as
-\texttt{TEMPORARY\ VIEW}s\footnote{I will explain more about the meaning
-  of ``temporary'' in Section~\ref{sec-temp-persist}.} instead of
-standard (and ``persistent'') SQL \texttt{VIEW}s as in MySQL.
-
-In MySQL there is no notion of a ``temporary view'', although other
-popular kinds of SQL databases do have it,
-\href{https://www.postgresql.org/docs/current/sql-createview.html}{such
-as the PostgreSQL database}\footnote{\url{https://www.postgresql.org/docs/current/sql-createview.html}}.
-So, a temporary view is not an exclusive concept of Spark SQL. However,
-it is a special type of SQL \texttt{VIEW} that is not present in all
-popular kinds of SQL databases.
-
-In other words, both Spark SQL and MySQL support the
-\texttt{CREATE\ VIEW} statement. In contrast, statements such as
-\texttt{CREATE\ TEMPORARY\ VIEW} and
-\texttt{CREATE\ OR\ REPLACE\ TEMPORARY\ VIEW} are available in Spark
-SQL, but not in MySQL.
-
-\subsubsection{\texorpdfstring{Registering a Spark SQL \texttt{VIEW} in
-the Spark SQL
-Catalog}{Registering a Spark SQL VIEW in the Spark SQL Catalog}}\label{registering-a-spark-sql-view-in-the-spark-sql-catalog}
-
-In \texttt{pyspark}, you can register a Spark DataFrame as a temporary
-SQL \texttt{VIEW} with the \texttt{createTempView()} or
-\texttt{createOrReplaceTempView()} DataFrame methods. These methods are
-equivalent to the \texttt{CREATE\ TEMPORARY\ VIEW} and
-\texttt{CREATE\ OR\ REPLACE\ TEMPORARY\ VIEW} SQL statements of Spark
-SQL, respectively.
- -In essence, these methods register your Spark DataFrame as a temporary -SQL \texttt{VIEW}, and have a single input, which is the name you want -to give to this new SQL \texttt{VIEW} you are creating inside a string: - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# To save the \textasciigrave{}df\textasciigrave{} DataFrame as a SQL VIEW,} -\CommentTok{\# use one of the methods below:} -\NormalTok{df.createTempView(}\StringTok{\textquotesingle{}example\_view\textquotesingle{}}\NormalTok{)} -\NormalTok{df.createOrReplaceTempView(}\StringTok{\textquotesingle{}example\_view\textquotesingle{}}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -After we executed the above statements, we can now access and use the -\texttt{df} DataFrame in any SQL query, like in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{sql\_query }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{SELECT *} -\StringTok{FROM example\_view} -\StringTok{WHERE value \textgreater{} 20} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{spark.sql(sql\_query).show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+---+-----+----------+ -| id|value| date| -+---+-----+----------+ -| 1| 28.3|2021-01-01| -| 3| 20.1|2021-01-02| -+---+-----+----------+ -\end{verbatim} - -So you use the \texttt{createTempView()} or -\texttt{createOrReplaceTempView()} methods when you want to make a Spark -DataFrame created in \texttt{pyspark} (that is, a python object), -available to Spark SQL. - -Besides that, you also have the option to create a temporary -\texttt{VIEW} by using pure SQL statements trough the \texttt{sql()} -method. However, when you create a temporary \texttt{VIEW} using pure -SQL, you can only use (inside this \texttt{VIEW}) native SQL objects -that are already stored inside your Spark SQL Context. - -This means that you cannot make a Spark DataFrame created in python -available to Spark SQL, by using a pure SQL inside the \texttt{sql()} -method. To do this, you have to use the DataFrame methods -\texttt{createTempView()} and \texttt{createOrReplaceTempView()}. - -As an example, the query below uses pure SQL statements to creates the -\texttt{active\_brazilian\_users} temporary \texttt{VIEW}, which selects -an existing SQL table called \texttt{hubspot.user\_mails}: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{CREATE} \KeywordTok{TEMPORARY} \KeywordTok{VIEW}\NormalTok{ active\_brazilian\_users }\KeywordTok{AS} -\KeywordTok{SELECT} \OperatorTok{*} -\KeywordTok{FROM}\NormalTok{ hubspot.user\_mails} -\KeywordTok{WHERE}\NormalTok{ state }\OperatorTok{==} \StringTok{\textquotesingle{}Active\textquotesingle{}} -\KeywordTok{AND}\NormalTok{ country\_location }\OperatorTok{==} \StringTok{\textquotesingle{}Brazil\textquotesingle{}} -\end{Highlighting} -\end{Shaded} - -Temporary \texttt{VIEW}s like the one above (which are created from pure -SQL statements being executed inside the \texttt{sql()} method) are kind -of unusual in Spark SQL. 
Because you can easily avoid the work of -creating a \texttt{VIEW} by using Common Table Expression (CTE) on a -\texttt{WITH} statement, like in the query below: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{WITH}\NormalTok{ active\_brazilian\_users }\KeywordTok{AS}\NormalTok{ (} - \KeywordTok{SELECT} \OperatorTok{*} - \KeywordTok{FROM}\NormalTok{ hubspot.user\_mails} - \KeywordTok{WHERE}\NormalTok{ state }\OperatorTok{==} \StringTok{\textquotesingle{}Active\textquotesingle{}} - \KeywordTok{AND}\NormalTok{ country\_location }\OperatorTok{==} \StringTok{\textquotesingle{}Brazil\textquotesingle{}} -\NormalTok{)} - -\KeywordTok{SELECT}\NormalTok{ A.}\FunctionTok{user}\NormalTok{, }\FunctionTok{SUM}\NormalTok{(sale\_value), B.user\_email} -\KeywordTok{FROM}\NormalTok{ sales.sales\_per\_user }\KeywordTok{AS}\NormalTok{ A} -\KeywordTok{INNER} \KeywordTok{JOIN}\NormalTok{ active\_brazilian\_users }\KeywordTok{AS}\NormalTok{ B} -\KeywordTok{GROUP} \KeywordTok{BY}\NormalTok{ A.}\FunctionTok{user}\NormalTok{, B.user\_email} -\end{Highlighting} -\end{Shaded} - -Just as a another example, you can also run a SQL query that creates a -persistent SQL \texttt{VIEW} (that is, without the \texttt{TEMPORARY} -clause). In the example below, I am saving the simple query that I -showed at the beginning of this chapter inside a \texttt{VIEW} called -\texttt{list\_of\_codes}. This \texttt{CREATE\ VIEW} statement, register -a persistent SQL \texttt{VIEW} in the SQL Catalog. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{sql\_query }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{CREATE OR REPLACE VIEW list\_of\_codes AS} -\StringTok{SELECT *} -\StringTok{FROM (} -\StringTok{ VALUES (11), (31), (24), (35)} -\StringTok{) AS List(Codes)} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{spark.sql(sql\_query)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -DataFrame[] -\end{verbatim} - -Now, every time I want to use this SQL query that selects a list of -codes, I can use this \texttt{list\_of\_codes} as a shortcut: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{spark.sql(}\StringTok{"SELECT * FROM list\_of\_codes"}\NormalTok{).show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----+ -|Codes| -+-----+ -| 11| -| 31| -| 24| -| 35| -+-----+ -\end{verbatim} - -\subsubsection{\texorpdfstring{\texttt{TABLEs} are stored as physical -tables}{TABLEs are stored as physical tables}}\label{tables-are-stored-as-physical-tables} - -In the other hand, SQL \texttt{TABLEs} are the ``opposite'' of SQL -\texttt{VIEWs}. That is, SQL \texttt{TABLEs} are stored as physical -tables inside the SQL database. In other words, each one of the rows of -your table are stored inside the SQL database. - -Because of this characteristic, when dealing with huges amounts of data, -SQL \texttt{TABLEs} are usually faster to load and transform. Because -you just have to read the data stored on the database, you do not need -to calculate it from scratch every time you use it. - -But, as a collateral effect, you usually have to physically update the -data inside this \texttt{TABLE}, by using, for example, -\texttt{INSERT\ INTO} statements. In other words, when dealing with SQL -\texttt{TABLE}'s you usually need to create (and manage) data pipelines -that are responsible for periodically update and append new data to this -SQL \texttt{TABLE}, and this might be a big burden to your process. 
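-
-At any point, if you want to check how Spark has classified each object
-you registered so far (as a physical \texttt{TABLE} or as a temporary
-\texttt{VIEW}), a quick sketch is to inspect the Spark SQL Catalog
-directly, using the \texttt{spark.catalog.listTables()} method from
-\texttt{pyspark}:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# A quick sketch: list what is currently registered in the}
-\CommentTok{\# Spark SQL Catalog of the current Spark Session. Each entry}
-\CommentTok{\# describes its type and whether it is temporary or not:}
-\ControlFlowTok{for}\NormalTok{ entry }\KeywordTok{in}\NormalTok{ spark.catalog.listTables():}
-\NormalTok{    }\BuiltInTok{print}\NormalTok{(entry.name, entry.tableType, entry.isTemporary)}
-\end{Highlighting}
-\end{Shaded}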
- -\subsubsection{\texorpdfstring{Registering Spark SQL \texttt{TABLE}s in -the Spark SQL -Catalog}{Registering Spark SQL TABLEs in the Spark SQL Catalog}}\label{registering-spark-sql-tables-in-the-spark-sql-catalog} - -In \texttt{pyspark}, you can register a Spark DataFrame as a SQL -\texttt{TABLE} with the \texttt{write.saveAsTable()} DataFrame method. -This method accepts, as first input, the name you want to give to this -SQL \texttt{TABLE} inside a string. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# To save the \textasciigrave{}df\textasciigrave{} DataFrame as a SQL TABLE:} -\NormalTok{df.write.saveAsTable(}\StringTok{\textquotesingle{}example\_table\textquotesingle{}}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -As you expect, after we registered the DataFrame as a SQL table, we can -now run any SQL query over \texttt{example\_table}, like in the example -below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{spark.sql(}\StringTok{"SELECT SUM(value) FROM example\_table"}\NormalTok{).show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+ -|sum(value)| -+----------+ -| 76.8| -+----------+ -\end{verbatim} - -You can also use pure SQL queries to create an empty SQL \texttt{TABLE} -from scratch, and then, feed this table with data by using -\texttt{INSERT\ INTO} statements. In the example below, we create a new -database called \texttt{examples}, and, inside of it, a table called -\texttt{code\_brazil\_states}. Then, we use multiple -\texttt{INSERT\ INTO} statements to populate this table with few rows of -data. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{all\_statements }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}CREATE DATABASE \textasciigrave{}examples\textasciigrave{};} -\StringTok{USE \textasciigrave{}examples\textasciigrave{};} -\StringTok{CREATE TABLE \textasciigrave{}code\_brazil\_states\textasciigrave{} (\textasciigrave{}code\textasciigrave{} INT, \textasciigrave{}state\_name\textasciigrave{} STRING);} -\StringTok{INSERT INTO \textasciigrave{}code\_brazil\_states\textasciigrave{} VALUES (31, "Minas Gerais");} -\StringTok{INSERT INTO \textasciigrave{}code\_brazil\_states\textasciigrave{} VALUES (15, "Pará");} -\StringTok{INSERT INTO \textasciigrave{}code\_brazil\_states\textasciigrave{} VALUES (41, "Paraná");} -\StringTok{INSERT INTO \textasciigrave{}code\_brazil\_states\textasciigrave{} VALUES (25, "Paraíba");\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\NormalTok{statements }\OperatorTok{=}\NormalTok{ all\_statements.split(}\StringTok{\textquotesingle{}}\CharTok{\textbackslash{}n}\StringTok{\textquotesingle{}}\NormalTok{)} -\ControlFlowTok{for}\NormalTok{ statement }\KeywordTok{in}\NormalTok{ statements:} -\NormalTok{ spark.sql(statement)} -\end{Highlighting} -\end{Shaded} - -We can see now this new physical SQL table using a simple query like -this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{spark}\OperatorTok{\textbackslash{}} -\NormalTok{ .sql(}\StringTok{\textquotesingle{}SELECT * FROM examples.code\_brazil\_states\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----+------------+ -|code| state_name| -+----+------------+ -| 41| Paraná| -| 31|Minas Gerais| -| 15| Pará| -| 25| Paraíba| -+----+------------+ -\end{verbatim} - -\subsubsection{The different save ``modes''}\label{sec-sql-save-modes} - -There are other arguments that you might want to use in the 
-\texttt{write.saveAsTable()} method, like the \texttt{mode} argument.
-This argument controls how Spark will save your data into the database.
-By default, \texttt{write.saveAsTable()} uses \texttt{mode\ =\ "error"}.
-In this mode, Spark will check if the table you referenced already
-exists, before it saves your data.
-
-Let's get back to the code we showed before (which is reproduced below).
-In this code, we asked Spark to save our data into a table called
-\texttt{"example\_table"}. Spark will check if a table with this name
-already exists. If it does, then Spark will raise an error that will
-stop the process (i.e.~no data is saved).
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df.write.saveAsTable(}\StringTok{\textquotesingle{}example\_table\textquotesingle{}}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-Raising an error when you do not want to accidentally affect a SQL table
-that already exists is a good practice. But you might prefer not to
-raise an error in this situation, and instead just ignore the operation
-and get on with your life. For cases like this,
-\texttt{write.saveAsTable()} offers the \texttt{mode\ =\ "ignore"}.
-
-So, in the code example below, we are trying to save the \texttt{df}
-DataFrame into a table called \texttt{example\_table}. But if this
-\texttt{example\_table} already exists, Spark will just silently ignore
-this operation, and will not save any data.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df.write.saveAsTable(}\StringTok{\textquotesingle{}example\_table\textquotesingle{}}\NormalTok{, mode }\OperatorTok{=} \StringTok{"ignore"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-In addition, \texttt{write.saveAsTable()} offers two more modes, which
-are \texttt{mode\ =\ "overwrite"} and \texttt{mode\ =\ "append"}. When
-you use one of these two modes, Spark \textbf{will always save your
-data}, no matter whether the SQL table you are trying to save into
-already exists or not. In essence, these two modes control whether Spark
-will delete or keep the previous rows of the SQL table intact, before it
-saves any new data.
-
-When you use \texttt{mode\ =\ "overwrite"}, Spark will automatically
-rewrite/replace the entire table with the current data of your
-DataFrame. In contrast, when you use \texttt{mode\ =\ "append"}, Spark
-will just append (or insert, or add) this data to the table. The
-subfigures in Figure~\ref{fig-save-table-modes} demonstrate these two
-modes visually.
-
-\begin{figure}
-
-\begin{minipage}{\linewidth}
-
-\centering{
-
-\includegraphics{Chapters/./../Figures/table-save-modes-overwrite.png}
-
-}
-
-\subcaption{\label{fig-mode-overwrite}Mode overwrite}
-
-\end{minipage}%
-\newline
-\begin{minipage}{\linewidth}
-
-\centering{
-
-\includegraphics{Chapters/./../Figures/table-save-modes-append.png}
-
-}
-
-\subcaption{\label{fig-mode-append}Mode append}
-
-\end{minipage}%
-
-\caption{\label{fig-save-table-modes}How Spark saves your data with
-different ``save modes''}
-
-\end{figure}%
-
-You can see the full list of arguments of \texttt{write.saveAsTable()},
-and their description, by
-\href{https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.DataFrameWriter.saveAsTable}{looking
-at the documentation}\footnote{\url{https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.sql.DataFrameWriter.saveAsTable}}.
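-
-To make these two modes concrete, the quick sketch below reuses the same
-\texttt{df} DataFrame and \texttt{example\_table} table from the
-previous examples, first replacing the table entirely, and then
-appending new rows to it:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# Replaces the entire contents of \textasciigrave{}example\_table\textasciigrave{}}
-\CommentTok{\# with the current rows of \textasciigrave{}df\textasciigrave{}:}
-\NormalTok{df.write.saveAsTable(}\StringTok{\textquotesingle{}example\_table\textquotesingle{}}\NormalTok{, mode }\OperatorTok{=} \StringTok{"overwrite"}\NormalTok{)}
-
-\CommentTok{\# Keeps the existing rows of \textasciigrave{}example\_table\textasciigrave{} and}
-\CommentTok{\# adds (inserts) the rows of \textasciigrave{}df\textasciigrave{} at the end of it:}
-\NormalTok{df.write.saveAsTable(}\StringTok{\textquotesingle{}example\_table\textquotesingle{}}\NormalTok{, mode }\OperatorTok{=} \StringTok{"append"}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}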
- -\subsection{Temporary versus Persistent sources}\label{sec-temp-persist} - -When you register any Spark DataFrame as a SQL \texttt{TABLE}, it -becomes a persistent source. Because the contents, the data, the rows of -the table are stored on disk, inside a database, and can be accessed any -time, even after you close or restart your computer (or your Spark -Session). In other words, it becomes ``persistent'' as in the sense of -``it does not die''. - -As another example, when you save a specific SQL query as a SQL -\texttt{VIEW} with the \texttt{CREATE\ VIEW} statement, this SQL -\texttt{VIEW} is saved inside the database. As a consequence, it becomes -a persistent source as well, and can be accessed and reused in other -Spark Sessions, unless you explicit drop (or ``remove'') this SQL -\texttt{VIEW} with a \texttt{DROP\ VIEW} statement. - -However, with methods like \texttt{createTempView()} and -\texttt{createOrReplaceTempView()} you register your Spark DataFrame as -a \emph{temporary} SQL \texttt{VIEW}. This means that the life (or time -of existence) of this \texttt{VIEW} is tied to your Spark Session. In -other words, it will exist in your Spark SQL Catalog only for the -duration of your Spark Session. When you close your Spark Session, this -\texttt{VIEW} just dies. When you start a new Spark Session it does not -exist anymore. As a result, you have to register your DataFrame again at -the catalog to use it one more time. - -\subsection{\texorpdfstring{Spark SQL Catalog is the bridge between SQL -and -\texttt{pyspark}}{Spark SQL Catalog is the bridge between SQL and pyspark}}\label{spark-sql-catalog-is-the-bridge-between-sql-and-pyspark} - -Remember, to run SQL queries over any Spark DataFrame, you must register -this DataFrame into the Spark SQL Catalog. Because of it, this Spark SQL -Catalog works almost as the bridge that connects the python objects that -hold your Spark DataFrames to the Spark SQL context. Without it, Spark -SQL will not find your Spark DataFrames. As a result, it can not run any -SQL query over it. - -When you try to use a DataFrame that is not currently registered at the -Spark SQL Catalog, Spark will automatically raise a -\texttt{AnalysisException}, like in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{spark}\OperatorTok{\textbackslash{}} -\NormalTok{ .sql(}\StringTok{"SELECT * FROM this.does\_not\_exist"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -AnalysisException: Table or view not found -\end{verbatim} - -The methods \texttt{saveAsTable()}, \texttt{createTempView()} and -\texttt{createOrReplaceTempView()} are the main methods to register your -Spark DataFrame into this Spark SQL Catalog. This means that you have to -use one of these methods before you run any SQL query over your Spark -DataFrame. - -\section{\texorpdfstring{The \texttt{penguins} -DataFrame}{The penguins DataFrame}}\label{the-penguins-dataframe} - -Over the next examples in this chapter, we will explore the -\texttt{penguins} DataFrame. This is the \texttt{penguins} dataset from -the -\href{https://allisonhorst.github.io/palmerpenguins/}{\texttt{palmerpenguins} -R library}\footnote{\url{https://allisonhorst.github.io/palmerpenguins/}}. -It stores data of multiple measurements of penguin species from the -islands in Palmer Archipelago. 
- -These measurements include size (flipper length, body mass, bill -dimensions) and sex, and they were collected by researchers of the -Antarctica LTER program, a member of the Long Term Ecological Research -Network. If you want to understand more about each field/column present -in this dataset, I recommend you to read the -\href{https://allisonhorst.github.io/palmerpenguins/reference/penguins.html}{official -documentation of this dataset}\footnote{\url{https://allisonhorst.github.io/palmerpenguins/reference/penguins.html}}. - -To get this data, you can download the CSV file called -\texttt{penguins.csv} (remember that this CSV can be downloaded from the -book repository\footnote{\url{https://github.com/pedropark99/Introd-pyspark/tree/main/Data}}). -In the code below, I am reading this CSV file and creating a Spark -DataFrame with its data. Then, I register this Spark DataFrame as a SQL -temporary view (called \texttt{penguins\_view}) using the -\texttt{createOrReplaceTempView()} method. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{path }\OperatorTok{=} \StringTok{"../Data/penguins.csv"} -\NormalTok{penguins }\OperatorTok{=}\NormalTok{ spark.read}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(path, header }\OperatorTok{=} \VariableTok{True}\NormalTok{)} - -\NormalTok{penguins.createOrReplaceTempView(}\StringTok{\textquotesingle{}penguins\_view\textquotesingle{}}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -After these commands, I have now a SQL view called -\texttt{penguins\_view} registered in my Spark SQL context, which I can -query it, using pure SQL: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{spark.sql(}\StringTok{\textquotesingle{}SELECT * FROM penguins\_view\textquotesingle{}}\NormalTok{).show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------+---------+--------------+-------------+-----------------+ -|species| island|bill_length_mm|bill_depth_mm|flipper_length_mm| -+-------+---------+--------------+-------------+-----------------+ -| Adelie|Torgersen| 39.1| 18.7| 181| -| Adelie|Torgersen| 39.5| 17.4| 186| -| Adelie|Torgersen| 40.3| 18| 195| -| Adelie|Torgersen| NULL| NULL| NULL| -| Adelie|Torgersen| 36.7| 19.3| 193| -+-------+---------+--------------+-------------+-----------------+ -only showing top 5 rows -... with 3 more columns: body_mass_g, sex, year -\end{verbatim} - -\section{Selecting your Spark -DataFrames}\label{selecting-your-spark-dataframes} - -An obvious way to access any SQL \texttt{TABLE} or \texttt{VIEW} -registered in your Spark SQL context, is to select it, through a simple -\texttt{SELECT\ *\ FROM} statement, like we saw in the previous -examples. However, it can be quite annoying to type ``SELECT * FROM'' -every time you want to use a SQL \texttt{TABLE} or \texttt{VIEW} in -Spark SQL. - -That is why Spark offers a shortcut to us, which is the \texttt{table()} -method of your Spark session. In other words, the code -\texttt{spark.table("table\_name")} is a shortcut to -\texttt{spark.sql("SELECT\ *\ FROM\ table\_name")}. They both mean the -same thing. 
For example, we could access \texttt{penguins\_view} as: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{spark}\OperatorTok{\textbackslash{}} -\NormalTok{ .table(}\StringTok{\textquotesingle{}penguins\_view\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------+---------+--------------+-------------+-----------------+ -|species| island|bill_length_mm|bill_depth_mm|flipper_length_mm| -+-------+---------+--------------+-------------+-----------------+ -| Adelie|Torgersen| 39.1| 18.7| 181| -| Adelie|Torgersen| 39.5| 17.4| 186| -| Adelie|Torgersen| 40.3| 18| 195| -| Adelie|Torgersen| NULL| NULL| NULL| -| Adelie|Torgersen| 36.7| 19.3| 193| -+-------+---------+--------------+-------------+-----------------+ -only showing top 5 rows -... with 3 more columns: body_mass_g, sex, year -\end{verbatim} - -\section{Executing SQL expressions}\label{executing-sql-expressions} - -As I noted at Section~\ref{sec-columns-related-expressions}, columns of -a Spark DataFrame (or objects of class \texttt{Column}) are closely -related to expressions. As a result, you usually use and execute -expressions in Spark when you want to transform (or mutate) columns of a -Spark DataFrame. - -This is no different for SQL expressions. A SQL expression is basically -any expression you would use on the \texttt{SELECT} statement of your -SQL query. As you can probably guess, since they are used in the -\texttt{SELECT} statement, these expressions are used to transform -columns of a Spark DataFrame. - -There are many column transformations that are particularly verbose and -expensive to write in ``pure'' \texttt{pyspark}. But you can use a SQL -expression in your favor, to translate this transformation into a more -short and concise form. Virtually any expression you write in -\texttt{pyspark} can be translated into a SQL expression. - -To execute a SQL expression, you give this expression inside a string to -the \texttt{expr()} function from the \texttt{pyspark.sql.functions} -module. 
Since expressions are used to transform columns, you normally -use the \texttt{expr()} function inside a \texttt{withColumn()} or a -\texttt{select()} DataFrame method, like in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ expr} - -\NormalTok{spark}\OperatorTok{\textbackslash{}} -\NormalTok{ .table(}\StringTok{\textquotesingle{}penguins\_view\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}specie\_island\textquotesingle{}}\NormalTok{,} -\NormalTok{ expr(}\StringTok{"CONCAT(species, \textquotesingle{}\_\textquotesingle{}, island)"}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}sex\_short\textquotesingle{}}\NormalTok{,} -\NormalTok{ expr(}\StringTok{"CASE WHEN sex == \textquotesingle{}male\textquotesingle{} THEN \textquotesingle{}M\textquotesingle{} ELSE \textquotesingle{}F\textquotesingle{} END"}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}specie\_island\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}sex\_short\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------------+---------+ -| specie_island|sex_short| -+----------------+---------+ -|Adelie_Torgersen| M| -|Adelie_Torgersen| F| -|Adelie_Torgersen| F| -|Adelie_Torgersen| F| -|Adelie_Torgersen| F| -+----------------+---------+ -only showing top 5 rows -\end{verbatim} - -I particulaly like to write ``if-else'' or ``case-when'' statements -using a pure \texttt{CASE\ WHEN} SQL statement inside the -\texttt{expr()} function. By using this strategy you usually get a more -simple statement that translates the intention of your code in a cleaner -way. 
But if I wrote the exact same \texttt{CASE\ WHEN} statement above -using pure \texttt{pyspark} functions, I would end up with a shorter -(but ``less clean'') statement using the \texttt{when()} and -\texttt{otherwise()} functions: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ (} -\NormalTok{ when, col,} -\NormalTok{ concat, lit} -\NormalTok{)} - -\NormalTok{spark}\OperatorTok{\textbackslash{}} -\NormalTok{ .table(}\StringTok{\textquotesingle{}penguins\_view\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}specie\_island\textquotesingle{}}\NormalTok{,} -\NormalTok{ concat(}\StringTok{\textquotesingle{}species\textquotesingle{}}\NormalTok{, lit(}\StringTok{\textquotesingle{}\_\textquotesingle{}}\NormalTok{), }\StringTok{\textquotesingle{}island\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}sex\_short\textquotesingle{}}\NormalTok{,} -\NormalTok{ when(col(}\StringTok{"sex"}\NormalTok{) }\OperatorTok{==} \StringTok{\textquotesingle{}male\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}M\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .otherwise(}\StringTok{\textquotesingle{}F\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}specie\_island\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}sex\_short\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------------+---------+ -| specie_island|sex_short| -+----------------+---------+ -|Adelie_Torgersen| M| -|Adelie_Torgersen| F| -|Adelie_Torgersen| F| -|Adelie_Torgersen| F| -|Adelie_Torgersen| F| -+----------------+---------+ -only showing top 5 rows -\end{verbatim} - -\section{Every DataFrame transformation in Python can be translated into -SQL}\label{every-dataframe-transformation-in-python-can-be-translated-into-sql} - -All DataFrame API transformations that you write in Python (using -\texttt{pyspark}) can be translated into SQL queries/expressions using -the Spark SQL module. Since the DataFrame API is a core part of -\texttt{pyspark}, the majority of python code you write with -\texttt{pyspark} can be translated into SQL queries (if you wanto to). - -Is worth pointing out, that, no matter which language you choose (Python -or SQL), they are both further compiled to the same base instructions. -The end result is that the Python code you write and his SQL translated -version \textbf{will perform the same} (they have the same efficiency), -because they are compiled to the same instructions before being executed -by Spark. - -\subsection{DataFrame methods are usually translated into SQL -keywords}\label{dataframe-methods-are-usually-translated-into-sql-keywords} - -When you translate the methods from the python \texttt{DataFrame} class -(like \texttt{orderBy()}, \texttt{select()} and \texttt{where()}) into -their equivalents in Spark SQL, you usually get SQL keywords (like -\texttt{ORDER\ BY}, \texttt{SELECT} and \texttt{WHERE}). 
- -For example, if I needed to get the top 5 penguins with the biggest body -mass at \texttt{penguins\_view}, that had sex equal to -\texttt{"female"}, and, ordered by bill length, I could run the -following python code: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} -\NormalTok{top\_5 }\OperatorTok{=}\NormalTok{ penguins}\OperatorTok{\textbackslash{}} -\NormalTok{ .where(col(}\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \StringTok{\textquotesingle{}female\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(col(}\StringTok{\textquotesingle{}body\_mass\_g\textquotesingle{}}\NormalTok{).desc())}\OperatorTok{\textbackslash{}} -\NormalTok{ .limit(}\DecValTok{5}\NormalTok{)} - -\NormalTok{top\_5}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(}\StringTok{\textquotesingle{}bill\_length\_mm\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------+------+--------------+-------------+-----------------+ -|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm| -+-------+------+--------------+-------------+-----------------+ -| Gentoo|Biscoe| 44.9| 13.3| 213| -| Gentoo|Biscoe| 45.1| 14.5| 207| -| Gentoo|Biscoe| 45.2| 14.8| 212| -| Gentoo|Biscoe| 46.5| 14.8| 217| -| Gentoo|Biscoe| 49.1| 14.8| 220| -+-------+------+--------------+-------------+-----------------+ -... with 3 more columns: body_mass_g, sex, year -\end{verbatim} - -I could translate the above python code to the following SQL query: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{WITH}\NormalTok{ top\_5 }\KeywordTok{AS}\NormalTok{ (} - \KeywordTok{SELECT} \OperatorTok{*} - \KeywordTok{FROM}\NormalTok{ penguins\_view} - \KeywordTok{WHERE}\NormalTok{ sex }\OperatorTok{==} \StringTok{\textquotesingle{}female\textquotesingle{}} - \KeywordTok{ORDER} \KeywordTok{BY}\NormalTok{ body\_mass\_g }\KeywordTok{DESC} - \KeywordTok{LIMIT} \DecValTok{5} -\NormalTok{)} - -\KeywordTok{SELECT} \OperatorTok{*} -\KeywordTok{FROM}\NormalTok{ top\_5} -\KeywordTok{ORDER} \KeywordTok{BY}\NormalTok{ bill\_length\_mm} -\end{Highlighting} -\end{Shaded} - -Again, to execute the above SQL query inside \texttt{pyspark} we need to -give this query as a string to the \texttt{sql()} method of our Spark -Session, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{query }\OperatorTok{=} \StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} -\StringTok{WITH top\_5 AS (} -\StringTok{ SELECT *} -\StringTok{ FROM penguins\_view} -\StringTok{ WHERE sex == \textquotesingle{}female\textquotesingle{}} -\StringTok{ ORDER BY body\_mass\_g DESC} -\StringTok{ LIMIT 5} -\StringTok{)} - -\StringTok{SELECT *} -\StringTok{FROM top\_5} -\StringTok{ORDER BY bill\_length\_mm} -\StringTok{\textquotesingle{}\textquotesingle{}\textquotesingle{}} - -\CommentTok{\# The same result of the example above} -\NormalTok{spark.sql(query).show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------+------+--------------+-------------+-----------------+ -|species|island|bill_length_mm|bill_depth_mm|flipper_length_mm| -+-------+------+--------------+-------------+-----------------+ -| Gentoo|Biscoe| 44.9| 13.3| 213| -| Gentoo|Biscoe| 45.1| 14.5| 207| -| Gentoo|Biscoe| 45.2| 14.8| 212| -| Gentoo|Biscoe| 46.5| 14.8| 217| -| Gentoo|Biscoe| 49.1| 14.8| 220| 
-+-------+------+--------------+-------------+-----------------+ -... with 3 more columns: body_mass_g, sex, year -\end{verbatim} - -\subsection{Spark functions are usually translated into SQL -functions}\label{sec-sql-expr} - -Every function from the \texttt{pyspark.sql.functions} module you might -use to describe your transformations in python, can be directly used in -Spark SQL. In other words, every Spark function that is accesible in -python, is also accesible in Spark SQL. - -When you translate these python functions into SQL, they usually become -a pure SQL function with the same name. For example, if I wanted to use -the \texttt{regexp\_extract()} python function, from the -\texttt{pyspark.sql.functions} module in Spark SQL, I just use the -\texttt{REGEXP\_EXTRACT()} SQL function. The same occurs to any other -function, like the \texttt{to\_date()} function for example. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ to\_date, regexp\_extract} -\CommentTok{\# \textasciigrave{}df1\textasciigrave{} and \textasciigrave{}df2\textasciigrave{} are both equal. Because they both} -\CommentTok{\# use the same \textasciigrave{}to\_date()\textasciigrave{} and \textasciigrave{}regexp\_extract()\textasciigrave{} functions} -\NormalTok{df1 }\OperatorTok{=}\NormalTok{ (spark} -\NormalTok{ .table(}\StringTok{\textquotesingle{}penguins\_view\textquotesingle{}}\NormalTok{)} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}extract\_number\textquotesingle{}}\NormalTok{,} -\NormalTok{ regexp\_extract(}\StringTok{\textquotesingle{}bill\_length\_mm\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}[0{-}9]+\textquotesingle{}}\NormalTok{, }\DecValTok{0}\NormalTok{)} -\NormalTok{ )} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{, to\_date(}\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}y\textquotesingle{}}\NormalTok{))} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}bill\_length\_mm\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}extract\_number\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}date\textquotesingle{}} -\NormalTok{ )} -\NormalTok{)} - -\NormalTok{df2 }\OperatorTok{=}\NormalTok{ (spark} -\NormalTok{ .table(}\StringTok{\textquotesingle{}penguins\_view\textquotesingle{}}\NormalTok{)} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}extract\_number\textquotesingle{}}\NormalTok{,} -\NormalTok{ expr(}\StringTok{"REGEXP\_EXTRACT(bill\_length\_mm, \textquotesingle{}[0{-}9]+\textquotesingle{}, 0)"}\NormalTok{)} -\NormalTok{ )} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{, expr(}\StringTok{"TO\_DATE(year, \textquotesingle{}y\textquotesingle{})"}\NormalTok{))} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}bill\_length\_mm\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}extract\_number\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}date\textquotesingle{}} -\NormalTok{ )} -\NormalTok{)} - -\NormalTok{df2.show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+----+--------------+----------+ -|bill_length_mm|year|extract_number| date| -+--------------+----+--------------+----------+ -| 39.1|2007| 39|2007-01-01| -| 
39.5|2007|            39|2007-01-01|
|          40.3|2007|            40|2007-01-01|
|          NULL|2007|          NULL|2007-01-01|
|          36.7|2007|            36|2007-01-01|
+--------------+----+--------------+----------+
only showing top 5 rows
\end{verbatim}

This is very handy: for every new Python function from the \texttt{pyspark.sql.functions} module that you learn how to use, you automatically learn how to use it in Spark SQL as well, because it is the same function, with basically the same name and arguments.

As an example, I could easily translate the above transformations that use the \texttt{to\_date()} and \texttt{regexp\_extract()} Python functions into the following SQL query (which I could execute through the \texttt{sql()} Spark Session method):

\begin{Shaded}
\begin{Highlighting}[]
\KeywordTok{SELECT}
\NormalTok{  bill\_length\_mm, }\DataTypeTok{year}\NormalTok{,}
\NormalTok{  REGEXP\_EXTRACT(bill\_length\_mm, }\StringTok{\textquotesingle{}[0{-}9]+\textquotesingle{}}\NormalTok{, }\DecValTok{0}\NormalTok{) }\KeywordTok{AS}\NormalTok{ extract\_number,}
\NormalTok{  }\FunctionTok{TO\_DATE}\NormalTok{(}\DataTypeTok{year}\NormalTok{, }\StringTok{\textquotesingle{}y\textquotesingle{}}\NormalTok{) }\KeywordTok{AS} \DataTypeTok{date}
\KeywordTok{FROM}\NormalTok{ penguins\_view}
\end{Highlighting}
\end{Shaded}

\bookmarksetup{startatroot}

\chapter{Transforming your Spark DataFrame - Part
2}\label{transforming-your-spark-dataframe---part-2}

At Chapter~\ref{sec-transforming-dataframes-part1} I introduced six core types of transformations over Spark DataFrames. In this chapter, I will expand your knowledge by introducing six more common types of transformations available to Spark DataFrames, which are:

\begin{itemize}
\tightlist
\item
  Replacing null values;
\item
  Removing duplicated values;
\item
  Merging multiple DataFrames with UNION operations;
\item
  Merging multiple DataFrames with JOIN operations;
\item
  Rows to columns and columns to rows with Pivot operations;
\item
  Collecting and explode operations;
\end{itemize}

\section{Removing duplicated values from your
DataFrame}\label{sec-remove-duplicates}

Removing duplicated values from DataFrames is a very common operation in ETL pipelines. In \texttt{pyspark} you have two options to remove duplicated values, which are:

\begin{itemize}
\tightlist
\item
  \texttt{distinct()}, which removes all duplicated values considering
  the combination of all current columns in the DataFrame;
\item
  \texttt{drop\_duplicates()} or \texttt{dropDuplicates()}, which remove
  all duplicated values considering a specific combination (or set) of
  columns in the DataFrame;
\end{itemize}

The three methods above are all DataFrame methods. Furthermore, \texttt{drop\_duplicates()} and \texttt{dropDuplicates()} are equivalent: they mean the same thing, have the same arguments, and perform the same operation.

When you run \texttt{drop\_duplicates()} or \texttt{dropDuplicates()} without any argument, they use, by default, the combination of all columns available in the DataFrame to identify the duplicated values. In that specific situation, \texttt{drop\_duplicates()} and \texttt{dropDuplicates()} become equivalent to the \texttt{distinct()} method, because they all use the combination of all columns in the DataFrame.

Let's pick the \texttt{supermarket\_sales} DataFrame exposed below as an example.
You can see below, that this DataFrame contains some duplicated -values, specifically on the transaction IDs ``T001'' e ``T004''. We also -have some ``degree of duplication'' on the transaction ID ``T006''. But -the two rows describing this ID ``T006'' are not precisely identical, -since they have a small difference on the \texttt{quantity} column. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ (} -\NormalTok{ StructType, StructField,} -\NormalTok{ StringType, IntegerType,} -\NormalTok{ FloatType} -\NormalTok{)} - -\NormalTok{schema }\OperatorTok{=}\NormalTok{ StructType([} -\NormalTok{ StructField(}\StringTok{"transaction\_id"}\NormalTok{, StringType(), }\VariableTok{True}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{"product\_name"}\NormalTok{, StringType(), }\VariableTok{True}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{"quantity"}\NormalTok{, IntegerType(), }\VariableTok{True}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{"price"}\NormalTok{, FloatType(), }\VariableTok{True}\NormalTok{)} -\NormalTok{])} - -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\StringTok{"T001"}\NormalTok{, }\StringTok{"Apple"}\NormalTok{, }\DecValTok{5}\NormalTok{, }\FloatTok{1.2}\NormalTok{),} -\NormalTok{ (}\StringTok{"T001"}\NormalTok{, }\StringTok{"Apple"}\NormalTok{, }\DecValTok{5}\NormalTok{, }\FloatTok{1.2}\NormalTok{),} -\NormalTok{ (}\StringTok{"T002"}\NormalTok{, }\StringTok{"Banana"}\NormalTok{, }\DecValTok{3}\NormalTok{, }\FloatTok{0.8}\NormalTok{),} -\NormalTok{ (}\StringTok{"T004"}\NormalTok{, }\StringTok{"Mango"}\NormalTok{, }\DecValTok{2}\NormalTok{, }\FloatTok{2.0}\NormalTok{),} -\NormalTok{ (}\StringTok{"T004"}\NormalTok{, }\StringTok{"Mango"}\NormalTok{, }\DecValTok{2}\NormalTok{, }\FloatTok{2.0}\NormalTok{),} -\NormalTok{ (}\StringTok{"T004"}\NormalTok{, }\StringTok{"Mango"}\NormalTok{, }\DecValTok{2}\NormalTok{, }\FloatTok{2.0}\NormalTok{),} -\NormalTok{ (}\StringTok{"T005"}\NormalTok{, }\StringTok{"Grapes"}\NormalTok{, }\DecValTok{1}\NormalTok{, }\FloatTok{3.5}\NormalTok{),} -\NormalTok{ (}\StringTok{"T006"}\NormalTok{, }\StringTok{"Apple"}\NormalTok{, }\DecValTok{2}\NormalTok{, }\FloatTok{1.2}\NormalTok{),} -\NormalTok{ (}\StringTok{"T006"}\NormalTok{, }\StringTok{"Apple"}\NormalTok{, }\DecValTok{1}\NormalTok{, }\FloatTok{1.2}\NormalTok{),} -\NormalTok{ (}\StringTok{"T007"}\NormalTok{, }\StringTok{"Banana"}\NormalTok{, }\DecValTok{4}\NormalTok{, }\FloatTok{0.8}\NormalTok{),} -\NormalTok{ (}\StringTok{"T008"}\NormalTok{, }\StringTok{"Apple"}\NormalTok{, }\DecValTok{3}\NormalTok{, }\FloatTok{1.2}\NormalTok{)} -\NormalTok{]} - -\NormalTok{supermarket\_sales }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, schema)} -\NormalTok{supermarket\_sales.show(}\DecValTok{6}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - - -\begin{verbatim} -+--------------+------------+--------+-----+ -|transaction_id|product_name|quantity|price| -+--------------+------------+--------+-----+ -| T001| Apple| 5| 1.2| -| T001| Apple| 5| 1.2| -| T002| Banana| 3| 0.8| -| T004| Mango| 2| 2.0| -| T004| Mango| 2| 2.0| -| T004| Mango| 2| 2.0| -+--------------+------------+--------+-----+ -only showing top 6 rows -\end{verbatim} - -We can remove these duplicated values by using the \texttt{distinct()} -method. In the example of transaction ID ``T004'', all duplicated rows -of this ID contains the same values -\texttt{("T004",\ "Mango",\ 2,\ 2.0)}, precisely in this order. 
Because of that, the \texttt{distinct()} method is enough to remove all of these duplicated values from the table.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{supermarket\_sales}\OperatorTok{\textbackslash{}}
\NormalTok{    .distinct()}\OperatorTok{\textbackslash{}}
\NormalTok{    .show(}\DecValTok{6}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+--------------+------------+--------+-----+
|transaction_id|product_name|quantity|price|
+--------------+------------+--------+-----+
|          T001|       Apple|       5|  1.2|
|          T002|      Banana|       3|  0.8|
|          T004|       Mango|       2|  2.0|
|          T005|      Grapes|       1|  3.5|
|          T006|       Apple|       2|  1.2|
|          T006|       Apple|       1|  1.2|
+--------------+------------+--------+-----+
only showing top 6 rows
\end{verbatim}

However, the two rows describing the transaction ID ``T006'' differ on the \texttt{quantity} column, and, as a result, the \texttt{distinct()} method does not identify them as duplicated values, so they are not removed from the input DataFrame.

Now, if we needed a DataFrame that contained one row for each transaction ID (that is, the values in the \texttt{transaction\_id} column must be unique), we could use the \texttt{drop\_duplicates()} method with only the \texttt{transaction\_id} column as the key, to remove all duplicated values of this column. This way, we get a slightly different output, as you can see below.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{supermarket\_sales}\OperatorTok{\textbackslash{}}
\NormalTok{    .drop\_duplicates([}\StringTok{\textquotesingle{}transaction\_id\textquotesingle{}}\NormalTok{])}\OperatorTok{\textbackslash{}}
\NormalTok{    .show(}\DecValTok{8}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+--------------+------------+--------+-----+
|transaction_id|product_name|quantity|price|
+--------------+------------+--------+-----+
|          T001|       Apple|       5|  1.2|
|          T002|      Banana|       3|  0.8|
|          T004|       Mango|       2|  2.0|
|          T005|      Grapes|       1|  3.5|
|          T006|       Apple|       2|  1.2|
|          T007|      Banana|       4|  0.8|
|          T008|       Apple|       3|  1.2|
+--------------+------------+--------+-----+
\end{verbatim}

In the example above, the duplicated values of IDs ``T001'' and ``T004'' were removed, as we expected. But we also removed the second row for ID ``T006'', because we did not list the \texttt{quantity} column in \texttt{drop\_duplicates()}, and, as a result, the method ignored the differences in the \texttt{quantity} column. In other words, it used solely the \texttt{transaction\_id} column to identify the duplicated values.

\section{\texorpdfstring{Other techniques for dealing with \texttt{null}
values}{Other techniques for dealing with null values}}\label{other-techniques-for-dealing-with-null-values}

At Section~\ref{sec-filter-null-values} I showed how you can use the \texttt{filter()} or \texttt{where()} DataFrame methods to remove all rows that contain a null value in some column. There are two other DataFrame methods available in Spark that you might want to use to deal with null values. In essence, you can either remove or replace these null values.
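As a quick reminder of the \texttt{filter()}/\texttt{where()} approach mentioned above, the snippet below is a minimal, self-contained sketch of it (the tiny \texttt{people} DataFrame is invented just for this illustration):

\begin{verbatim}
# A minimal sketch of the filter()-based approach: keep only the
# rows where the `age` column is not null. The `people` DataFrame
# is invented just for this illustration.
from pyspark.sql.functions import col

people = spark.createDataFrame(
    [("Anne", 30), ("Bob", None)],
    ["name", "age"]
)

people.where(col("age").isNotNull()).show()
\end{verbatim}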
\subsection{\texorpdfstring{Replacing \texttt{null}
values}{Replacing null values}}\label{replacing-null-values}

Instead of removing the null values and pretending that they never existed, you might prefer to replace them with a more useful or representative value, such as \texttt{0}, an empty string (\texttt{\textquotesingle{}\textquotesingle{}}), a \texttt{False} value, etc. To do that in \texttt{pyspark}, we can use the \texttt{na.fill()} and \texttt{fillna()} DataFrame methods.

Both methods mean the same thing and work the exact same way. The most popular way of using these methods is to provide a Python dict as input. Inside this dict you have key-value pairs, where the key represents the column name, and the value represents the static value that will replace all null values found in the column specified by the key.

In the example below, I created a simple \texttt{df} DataFrame which contains some null values in the \texttt{age} column. By providing the dict \texttt{\{\textquotesingle{}age\textquotesingle{}:\ 0\}} to \texttt{fillna()}, I am asking \texttt{fillna()} to replace all null values found in the \texttt{age} column with the value \texttt{0} (zero).

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data }\OperatorTok{=}\NormalTok{ [}
\NormalTok{    (}\DecValTok{1}\NormalTok{, }\StringTok{"John"}\NormalTok{, }\VariableTok{None}\NormalTok{, }\StringTok{"2023{-}04{-}05"}\NormalTok{),}
\NormalTok{    (}\DecValTok{2}\NormalTok{, }\StringTok{"Alice"}\NormalTok{, }\DecValTok{25}\NormalTok{, }\StringTok{"2023{-}04{-}09"}\NormalTok{),}
\NormalTok{    (}\DecValTok{3}\NormalTok{, }\StringTok{"Bob"}\NormalTok{, }\VariableTok{None}\NormalTok{, }\StringTok{"2023{-}04{-}12"}\NormalTok{),}
\NormalTok{    (}\DecValTok{4}\NormalTok{, }\StringTok{"Jane"}\NormalTok{, }\DecValTok{30}\NormalTok{, }\VariableTok{None}\NormalTok{),}
\NormalTok{    (}\DecValTok{5}\NormalTok{, }\StringTok{"Mike"}\NormalTok{, }\DecValTok{35}\NormalTok{, }\VariableTok{None}\NormalTok{)}
\NormalTok{]}
\NormalTok{columns }\OperatorTok{=}\NormalTok{ [}\StringTok{"id"}\NormalTok{, }\StringTok{"name"}\NormalTok{, }\StringTok{"age"}\NormalTok{, }\StringTok{"date"}\NormalTok{]}
\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, columns)}

\CommentTok{\# Or \textasciigrave{}df.na.fill(\{\textquotesingle{}age\textquotesingle{}: 0\}).show()\textasciigrave{}}
\CommentTok{\# It is the same thing}
\NormalTok{df.fillna(\{}\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{: }\DecValTok{0}\NormalTok{\}).show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+---+-----+---+----------+
| id| name|age|      date|
+---+-----+---+----------+
|  1| John|  0|2023-04-05|
|  2|Alice| 25|2023-04-09|
|  3|  Bob|  0|2023-04-12|
|  4| Jane| 30|      NULL|
|  5| Mike| 35|      NULL|
+---+-----+---+----------+
\end{verbatim}

You can see in the above example that the null values present in the \texttt{date} column were kept intact in the result, because we did not ask \texttt{fillna()} to replace the values of this column, since we did not include it in the input dict.
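Besides the dict form, \texttt{fillna()} also accepts a single value, which is then applied to every column with a compatible data type. The snippet below is a minimal sketch of this behaviour, reusing the \texttt{df} DataFrame created above (the fill values are arbitrary placeholders chosen for the illustration):

\begin{verbatim}
# A numeric fill value only touches columns with a compatible
# (numeric) type, so the string `date` column keeps its nulls:
df.fillna(0).show()

# A string fill value only touches string columns, so here only
# `date` is affected (the value is an arbitrary placeholder):
df.fillna('1900-01-01').show()

# The optional `subset` argument restricts which columns are
# considered by the fill:
df.fillna(0, subset=['age']).show()
\end{verbatim}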
If we do include this \texttt{date} column in the input dict, then \texttt{fillna()} will take care of this column as well:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df.fillna(\{}\StringTok{\textquotesingle{}age\textquotesingle{}}\NormalTok{: }\DecValTok{0}\NormalTok{, }\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}2023{-}01{-}01\textquotesingle{}}\NormalTok{\})}\OperatorTok{\textbackslash{}}
\NormalTok{    .show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+---+-----+---+----------+
| id| name|age|      date|
+---+-----+---+----------+
|  1| John|  0|2023-04-05|
|  2|Alice| 25|2023-04-09|
|  3|  Bob|  0|2023-04-12|
|  4| Jane| 30|2023-01-01|
|  5| Mike| 35|2023-01-01|
+---+-----+---+----------+
\end{verbatim}

\subsection{\texorpdfstring{Dropping all \texttt{null}
values}{Dropping all null values}}\label{dropping-all-null-values}

Spark also offers the \texttt{na.drop()} and \texttt{dropna()} DataFrame methods, which you can use to easily remove any row that contains a null value in any column of the DataFrame. This is different from \texttt{filter()} and \texttt{where()}, because with those two methods you have to build a logical expression that translates to ``not-null values''.

In contrast, with the \texttt{na.drop()} and \texttt{dropna()} methods you do not write a logical expression. You just call these methods, and they do the heavy work for you: they search through the entire DataFrame, and when they identify a null value, they remove the entire row that contains it.

For example, if we apply these methods to the \texttt{df} DataFrame that we used in the previous section, this is the end result:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df.na.drop()}\OperatorTok{\textbackslash{}}
\NormalTok{    .show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+---+-----+---+----------+
| id| name|age|      date|
+---+-----+---+----------+
|  2|Alice| 25|2023-04-09|
+---+-----+---+----------+
\end{verbatim}

\section{Union operations}\label{union-operations}

When you have many individual DataFrames that have the same columns, and you want to unify them into a single big DataFrame that has all the rows from these different DataFrames, you want to perform a UNION operation.

A UNION operation works on a pair of DataFrames: it returns the row-wise union of the two. In \texttt{pyspark}, we perform UNION operations by using the \texttt{union()} DataFrame method. To use this method, you just provide the other DataFrame you want to make the union with. So the expression \texttt{df1.union(df2)} creates a new DataFrame which contains all the rows from both the \texttt{df1} and \texttt{df2} DataFrames.

Moreover, in common SQL engines there are usually two kinds of UNION operations: \emph{union all} and \emph{union distinct}. When you use a \emph{union all} operation, you are saying that you just want to unify the two DataFrames, no matter what data you find in each one of them. You do not care if duplicated values are generated in the process, because an observation ``x'' might be present in both \texttt{df1} and \texttt{df2}.

In contrast, a \emph{union distinct} operation is the exact opposite of that. It merges the rows from both DataFrames together, and then it removes all duplicated values from the result.
So you use an \emph{union -distinct} operation when you want a single DataFrame that contains all -rows from both DataFrames \texttt{df1} and \texttt{df2}, but, you do not -want any duplicated rows into this single DataFrame. - -By default, the \texttt{union()} method always perform an \emph{union -all} operation. However, to do an \emph{union distinct} operation in -pyspark, you actually have to use the \texttt{union()} method in -conjunction with the \texttt{distinct()} or \texttt{drop\_duplicates()} -methods. In other words, there is not a direct method in pyspark that -performs an \emph{union distinct} operation on a single command. - -Look at the example below with \texttt{df1} and \texttt{df2} DataFrames. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df1 }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\DecValTok{1}\NormalTok{, }\StringTok{\textquotesingle{}Anne\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}F\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{5}\NormalTok{, }\StringTok{\textquotesingle{}Mike\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}M\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{2}\NormalTok{, }\StringTok{\textquotesingle{}Francis\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}M\textquotesingle{}}\NormalTok{),} -\NormalTok{]} - -\NormalTok{df2 }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\DecValTok{5}\NormalTok{, }\StringTok{\textquotesingle{}Mike\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}M\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{7}\NormalTok{, }\StringTok{\textquotesingle{}Arthur\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}M\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\DecValTok{1}\NormalTok{, }\StringTok{\textquotesingle{}Anne\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}F\textquotesingle{}}\NormalTok{),} -\NormalTok{]} - -\NormalTok{df1 }\OperatorTok{=}\NormalTok{ spark.createDataFrame(df1, [}\StringTok{\textquotesingle{}ID\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Sex\textquotesingle{}}\NormalTok{])} -\NormalTok{df2 }\OperatorTok{=}\NormalTok{ spark.createDataFrame(df2, [}\StringTok{\textquotesingle{}ID\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Sex\textquotesingle{}}\NormalTok{])} - -\CommentTok{\# An example of UNION ALL operation:} -\NormalTok{df1.union(df2).show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+---+-------+---+ -| ID| Name|Sex| -+---+-------+---+ -| 1| Anne| F| -| 5| Mike| M| -| 2|Francis| M| -| 5| Mike| M| -| 7| Arthur| M| -| 1| Anne| F| -+---+-------+---+ -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# An example of UNION DISTINCT operation} -\NormalTok{df1}\OperatorTok{\textbackslash{}} -\NormalTok{ .union(df2)}\OperatorTok{\textbackslash{}} -\NormalTok{ .distinct()}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+---+-------+---+ -| ID| Name|Sex| -+---+-------+---+ -| 1| Anne| F| -| 5| Mike| M| -| 2|Francis| M| -| 7| Arthur| M| -+---+-------+---+ -\end{verbatim} - -Because an UNION operation merges the two DataFrames in a vertical way, -the columns between the two DataFrames must match. 
If the columns of the two DataFrames are not in the same positions, a mismatch happens during the operation, and Spark will do nothing to fix your mistake.

Most programming languages would issue an error at this point, warning you about the conflict between the columns found in each DataFrame and their respective positions. However, if the columns are out of order, Spark will continue with the UNION operation as if nothing were wrong; it will not even raise a warning. Since this problem can easily go unnoticed, be aware of it.

In the example below, we have a third DataFrame called \texttt{df3}. Notice that the columns in \texttt{df3} are the same as in \texttt{df1} and \texttt{df2}. However, the columns of \texttt{df3} are in a different order than in \texttt{df1} and \texttt{df2}.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{data }\OperatorTok{=}\NormalTok{ [}
\NormalTok{    (}\StringTok{\textquotesingle{}Marla\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}F\textquotesingle{}}\NormalTok{, }\DecValTok{9}\NormalTok{),}
\NormalTok{    (}\StringTok{\textquotesingle{}Andrew\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}M\textquotesingle{}}\NormalTok{, }\DecValTok{15}\NormalTok{),}
\NormalTok{    (}\StringTok{\textquotesingle{}Peter\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}M\textquotesingle{}}\NormalTok{, }\DecValTok{12}\NormalTok{)}
\NormalTok{]}
\NormalTok{df3 }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, [}\StringTok{\textquotesingle{}Name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Sex\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}ID\textquotesingle{}}\NormalTok{])}
\NormalTok{df3.show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+------+---+---+
|  Name|Sex| ID|
+------+---+---+
| Marla|  F|  9|
|Andrew|  M| 15|
| Peter|  M| 12|
+------+---+---+
\end{verbatim}

If we try to perform a UNION operation between, let's say, \texttt{df2} and \texttt{df3}, the operation just works. But the end result is not correct, as you can see in the example below.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{df2.union(df3).show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+------+------+---+
|    ID|  Name|Sex|
+------+------+---+
|     5|  Mike|  M|
|     7|Arthur|  M|
|     1|  Anne|  F|
| Marla|     F|  9|
|Andrew|     M| 15|
| Peter|     M| 12|
+------+------+---+
\end{verbatim}

Although this might be problematic, Spark provides an easy-to-use solution for when the columns are in different positions in each DataFrame: the \texttt{unionByName()} method.

The difference between the \texttt{union()} and \texttt{unionByName()} methods is that \texttt{unionByName()} matches columns by name before it performs the UNION. In other words, it compares the column names found in each DataFrame and matches each column by its name. This way, the columns present in each DataFrame of the UNION must have the same names, but they do not need to be in the same positions in both DataFrames.

If we use this method on the same example as above, you can see below that we get a different result, and a correct one this time.
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df2.unionByName(df3)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+---+------+---+ -| ID| Name|Sex| -+---+------+---+ -| 5| Mike| M| -| 7|Arthur| M| -| 1| Anne| F| -| 9| Marla| F| -| 15|Andrew| M| -| 12| Peter| M| -+---+------+---+ -\end{verbatim} - -Therefore, if you want to make an UNION operation between two -DataFrames, you can generally use the \texttt{union()} method. But if -you suspect the columns from these DataFrames might be in different -positions on each DataFrame, you can change to the -\texttt{unionByName()} method. - -In contrast, if the columns are different not only on position, but -also, on column name, then, \texttt{unionByName()} will not work. The -two DataFrames involved on an UNION operation must be very similar. If -they are not similar, then, you will have a hard time trying to do the -operation. - -Another problem that you might face is if you try to unify two -DataFrames that have different numbers of columns between them. In this -situation, it means that the two DataFrames have ``different widths'', -and, as a result of that, an \texttt{AnalysisException} error will be -raised by Spark if you try to unify them with an UNION operation, like -in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ (} -\NormalTok{ StructField,} -\NormalTok{ StructType,} -\NormalTok{ LongType} -\NormalTok{)} - -\NormalTok{schema }\OperatorTok{=}\NormalTok{ StructType([StructField(}\StringTok{\textquotesingle{}ID\textquotesingle{}}\NormalTok{, LongType(), }\VariableTok{False}\NormalTok{)])} -\NormalTok{df4 }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\DecValTok{19}\NormalTok{,), (}\DecValTok{17}\NormalTok{,), (}\DecValTok{16}\NormalTok{,)} -\NormalTok{]} -\NormalTok{df4 }\OperatorTok{=}\NormalTok{ spark.createDataFrame(df4, schema)} -\NormalTok{df3.union(df4).show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -AnalysisException: Union can only be performed on tables -with the same number of columns, but the first table has -3 columns and the second table has 1 columns; -'Union false, false -:- LogicalRDD [Name#703, Sex#704, ID#705L], false -+- LogicalRDD [ID#762L], false -\end{verbatim} - -\section{Join operations}\label{join-operations} - -A JOIN operation is another very commom operation that is also used to -bring data from scattered sources into a single unified DataFrame. In -pyspark, we can build JOIN operations by using the \texttt{join()} -DataFrame method. This method accepts three arguments, which are: - -\begin{itemize} -\tightlist -\item - \texttt{other}: the DataFrame you want to JOIN with (i.e.~the - DataFrame on the right side of the JOIN); -\item - \texttt{on}: a column name, or a list of column names, that represents - the key (or keys) of the JOIN; -\item - \texttt{how}: the kind of JOIN you want to perform (inner, full, left, - right); -\end{itemize} - -As a first example, let's use the \texttt{info} and -\texttt{band\_instruments} DataFrames. 
With the source code below, you -can quickly re-create these two DataFrames in your session: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{info }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\StringTok{\textquotesingle{}Mick\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Rolling Stones\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}1943{-}07{-}26\textquotesingle{}}\NormalTok{, }\VariableTok{True}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}John\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Beatles\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}1940{-}09{-}10\textquotesingle{}}\NormalTok{, }\VariableTok{True}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}Paul\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Beatles\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}1942{-}06{-}18\textquotesingle{}}\NormalTok{, }\VariableTok{True}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}George\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Beatles\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}1943{-}02{-}25\textquotesingle{}}\NormalTok{, }\VariableTok{True}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}Ringo\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Beatles\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}1940{-}07{-}07\textquotesingle{}}\NormalTok{, }\VariableTok{True}\NormalTok{)} -\NormalTok{]} - -\NormalTok{info }\OperatorTok{=}\NormalTok{ spark.createDataFrame(} -\NormalTok{ info,} -\NormalTok{ [}\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}band\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}born\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}children\textquotesingle{}}\NormalTok{]} -\NormalTok{)} - -\NormalTok{band\_instruments }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\StringTok{\textquotesingle{}John\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}guitar\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}Paul\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}bass\textquotesingle{}}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}Keith\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}guitar\textquotesingle{}}\NormalTok{)} -\NormalTok{]} - -\NormalTok{band\_instruments }\OperatorTok{=}\NormalTok{ spark.createDataFrame(} -\NormalTok{ band\_instruments,} -\NormalTok{ [}\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}plays\textquotesingle{}}\NormalTok{]} -\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -If you look closely to these two DataFrames, you will probably notice -that they both describe musicians from two famous rock bands from 60's -and 70's. The \texttt{info} DataFrame have more personal or general -informations about the musicians, while the \texttt{band\_instruments} -DataFrame have only data about the main musical instruments that they -play. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{info.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------+--------------+----------+--------+ -| name| band| born|children| -+------+--------------+----------+--------+ -| Mick|Rolling Stones|1943-07-26| true| -| John| Beatles|1940-09-10| true| -| Paul| Beatles|1942-06-18| true| -|George| Beatles|1943-02-25| true| -| Ringo| Beatles|1940-07-07| true| -+------+--------------+----------+--------+ -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{band\_instruments.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----+------+ -| name| plays| -+-----+------+ -| John|guitar| -| Paul| bass| -|Keith|guitar| -+-----+------+ -\end{verbatim} - -It might be of your interest, to have a single DataFrame that contains -both the personal information and the musical instrument of each -musician. In this case, you can build a JOIN operation between these -DataFrames to get this result. An example of this JOIN in pyspark would -be: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{info.join(band\_instruments, on }\OperatorTok{=} \StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, how }\OperatorTok{=} \StringTok{\textquotesingle{}left\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------+--------------+----------+--------+------+ -| name| band| born|children| plays| -+------+--------------+----------+--------+------+ -| Mick|Rolling Stones|1943-07-26| true| NULL| -| John| Beatles|1940-09-10| true|guitar| -| Paul| Beatles|1942-06-18| true| bass| -|George| Beatles|1943-02-25| true| NULL| -| Ringo| Beatles|1940-07-07| true| NULL| -+------+--------------+----------+--------+------+ -\end{verbatim} - -In the example above, we are performing a \emph{left join} between the -two DataFrames, using the \texttt{name} column as the JOIN key. Now, we -have a single DataFrame with all 5 columns from both DataFrames -(\texttt{plays}, \texttt{children}, \texttt{name}, \texttt{band} and -\texttt{born}). - -\subsection{What is a JOIN ?}\label{what-is-a-join} - -I imagine you are already familiar with JOIN operations. However, in -order to build good and precise JOIN operations, is very important to -know what a JOIN operation actually is. So let's revisit it. - -A JOIN operation merges two different DataFrames together into a single -unified DataFrame. It does this by using a column (or a set of columns) -as keys to identify the observations of both DataFrames, and connects -these observations together. - -A JOIN (like UNION) is also an operation that works on a pair of -DataFrames. It is very commom to refer to this pair as ``the sides of -the JOIN''. That is, the DataFrame on the left side of the JOIN, and the -DataFrame on the right side of the JOIN. Or also, the DataFrames ``A'' -(left side) and ``B'' (right side). - -The main idea (or objective) of the JOIN is to bring all data from the -DataFrame on the right side, into the DataFrame on the left side. In -other words, a JOIN between DataFrames A and B results into a DataFrame -C which contains all columns and rows from both DataFrames A and B. - -In an UNION operation, both DataFrames must have the same columns, -because in an UNION operation you are concatenating both DataFrames -together vertically, so the number of columns (or the ``width'' of the -tables) need to match. 
However, in a JOIN operation, the two DataFrames only need to have at least one column in common. Apart from that, in a JOIN, the two DataFrames can have very different structures and columns from each other.

One key characteristic of a JOIN operation is its key-matching mechanism. A JOIN uses the columns you provide to \textbf{build a key}. This key is used to identify rows (or ``observations'') in both DataFrames. In other words, these keys identify relationships between the two DataFrames, and these relationships are vital to the JOIN.

If we go back to the \texttt{info} and \texttt{band\_instruments} DataFrames and analyse them a bit more, we can see that they both have a \texttt{name} column, which contains the name of the musician being described in the current row. This \texttt{name} column can be used as \textbf{the key} of the JOIN, because it is available in both DataFrames and can be used to identify a single observation (or a single musician) present in each DataFrame.

So the JOIN key is a column (or a combination of columns) that identifies which observations are (and are not) present in both DataFrames. At Figure~\ref{fig-keys-comparison}, we can see the observations from \texttt{info} and \texttt{band\_instruments} in a visual manner. You can see in the figure that both Paul and John are described in both DataFrames. At the same time, Ringo, Mick and George are present only in \texttt{info}, while Keith is present only in \texttt{band\_instruments}.

\begin{figure}

\centering{

\includegraphics{Chapters/./../Figures/keys_comparacao.png}

}

\caption{\label{fig-keys-comparison}The relationships between the \texttt{info} and \texttt{band\_instruments} DataFrames}

\end{figure}%

In a certain way, you can see the JOIN key as a way to \textbf{identify relationships between the two DataFrames}. A JOIN operation uses these relationships to merge your DataFrames in a precise way. A JOIN does not simply glue two DataFrames together horizontally; it uses the JOIN key to perform a matching process between the observations of the two DataFrames.

This matching process ensures that the data present in DataFrame ``B'' is correctly transported to DataFrame ``A''. In other words, it ensures that oranges are paired with oranges, apples with apples, bananas with bananas, and so on.

Just to describe visually what this matching process is, we have Figure~\ref{fig-join-matching} below. In this figure, we have two DataFrames on the left and center of the image, which represent the inputs of the JOIN. We also have a third DataFrame on the right side of the image, which is the output (or the result) of the JOIN.

In this specific example, the column that represents the JOIN key is the \texttt{ID} column. Not only is this column present in both DataFrames, but it also represents a unique identifier for each person described in both tables. And that is precisely the job of a JOIN key: it represents a way to identify observations (or ``persons'', or ``objects'', etc.) in both tables.

You can see at Figure~\ref{fig-join-matching} that when the \texttt{ID} 100 is found on the 1st row of the left DataFrame, the JOIN initiates a lookup/matching process on the center DataFrame, looking for a row that matches this \texttt{ID} 100.
When it finds this \texttt{ID} 100 (on the 4th row of the center DataFrame), it captures and connects these two rows, because they describe the same person (or observation), and because of that, they should be connected. This same matching process happens for all remaining \texttt{ID} values.

\begin{figure}

\centering{

\includegraphics{Chapters/./../Figures/pareamento1.png}

}

\caption{\label{fig-join-matching}The matching process of a JOIN
operation}

\end{figure}%

\subsection{The different types of
JOIN}\label{the-different-types-of-join}

JOIN operations actually come in different flavours (or types). The four main types of JOIN are: \emph{full}, \emph{left}, \emph{right} and \emph{inner}. All of these types perform the same steps and matching process that we described in the previous section, but they differ in how they treat unmatched observations. In other words, these different types of JOIN differ in \textbf{what they do when an observation is not found in both DataFrames of the JOIN} (e.g.~when an observation is found only on table A).

All four types perform the same matching process between the two DataFrames, and connect the observations that are found in both. However, which rows are included in the final output is what changes between each type (or ``flavour'') of JOIN.

In this context, the words ``left'' and ``right'' identify the DataFrames involved in the JOIN operation. That is, the word \emph{left} refers to the DataFrame on the left side of the JOIN, while the word \emph{right} refers to the DataFrame on the right side of the JOIN.

A very useful way of understanding these different types of JOIN is to represent both DataFrames as sets (as we learn in mathematics). Figure~\ref{fig-join-sets} gives you a visual representation of each type of JOIN using this ``set model''. Remember, all of these different types of JOIN work the same way; they just take different actions when an observation is not found in both tables.

The most ``complete'' and ``greedy'' type of JOIN is the \emph{full join}, because this type keeps every observation from both DataFrames. In other words, this type of JOIN results in a DataFrame that has all observations from both DataFrames. It does not matter if an observation is present only on table A, only on table B, or on both tables: a \emph{full join} will always try to connect as many observations as it can.

\begin{figure}

\centering{

\includegraphics{Chapters/./../Figures/join-sets.png}

}

\caption{\label{fig-join-sets}A visual representation of the types of JOIN using sets}

\end{figure}%

That is why the \emph{full join} is represented in Figure~\ref{fig-join-sets} as the union of the two tables (or the two sets). In contrast, an \emph{inner join} is the intersection of the two tables (or sets). That is, an \emph{inner join} results in a new DataFrame which contains solely the observations that could be found in both tables. If a specific observation is found in only one table of the JOIN, it is automatically removed from the result of the \emph{inner join}.
- -If we go back to the \texttt{info} and \texttt{band\_instruments} -DataFrames, and use them as an example, you can see that only Paul and -John are included on the result of an \emph{inner join}. While in a -\emph{full join}, all musicians are included on the resulting DataFrame. - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# An inner join between \textasciigrave{}info\textasciigrave{} and \textasciigrave{}band\_instruments\textasciigrave{}:} -\NormalTok{info.join(band\_instruments, on }\OperatorTok{=} \StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, how }\OperatorTok{=} \StringTok{\textquotesingle{}inner\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----+-------+----------+--------+------+ -|name| band| born|children| plays| -+----+-------+----------+--------+------+ -|John|Beatles|1940-09-10| true|guitar| -|Paul|Beatles|1942-06-18| true| bass| -+----+-------+----------+--------+------+ -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# A full join between \textasciigrave{}info\textasciigrave{} and \textasciigrave{}band\_instruments\textasciigrave{}:} -\NormalTok{info.join(band\_instruments, on }\OperatorTok{=} \StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, how }\OperatorTok{=} \StringTok{\textquotesingle{}full\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------+--------------+----------+--------+------+ -| name| band| born|children| plays| -+------+--------------+----------+--------+------+ -|George| Beatles|1943-02-25| true| NULL| -| John| Beatles|1940-09-10| true|guitar| -| Keith| NULL| NULL| NULL|guitar| -| Mick|Rolling Stones|1943-07-26| true| NULL| -| Paul| Beatles|1942-06-18| true| bass| -| Ringo| Beatles|1940-07-07| true| NULL| -+------+--------------+----------+--------+------+ -\end{verbatim} - -On the other hand, the \emph{left join} and \emph{right join} are kind -of self-explanatory. On a \emph{left join}, all the observations from -the left DataFrame are kept intact on the resulting DataFrame of the -JOIN, regardless of whether these observations were found or not on the -right DataFrame. In contrast, an \emph{right join} is the opposite of -that. So, all observations from the right DataFrame are kept intact on -the resulting DataFrame of the JOIN. - -In pyspark, you can define the type of JOIN you want to use by setting -the \texttt{how} argument at \texttt{join()} method. This argument -accepts a string with the type of JOIN you want to use as input. - -\begin{itemize} -\tightlist -\item - \texttt{how\ =\ \textquotesingle{}left\textquotesingle{}}: make a - \emph{left join}; -\item - \texttt{how\ =\ \textquotesingle{}right\textquotesingle{}}: make a - \emph{right join}; -\item - \texttt{how\ =\ \textquotesingle{}full\textquotesingle{}}: make a - \emph{full join}; -\item - \texttt{how\ =\ \textquotesingle{}inner\textquotesingle{}}: make an - \emph{inner join}; -\item - \texttt{how\ =\ \textquotesingle{}semi\textquotesingle{}}: make a - \emph{semi join}; -\item - \texttt{how\ =\ \textquotesingle{}anti\textquotesingle{}}: make an - \emph{anti join}; -\end{itemize} - -You can see on the list above, that \texttt{pyspark} do have two more -types of JOINs, which are the \emph{semi join} and \emph{anti join}. -These are ``filtering types'' of JOINs. 
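Before looking at these filtering types in detail, note that the list above also includes the \emph{right join}, which has not been demonstrated yet (only the \emph{left join} was used above). The snippet below is a small, illustrative sketch of a \emph{right join} between the same \texttt{info} and \texttt{band\_instruments} DataFrames; it is not shown in the original examples above:

\begin{verbatim}
# A right join keeps every row of the right-side DataFrame
# (band_instruments), filling the missing `info` columns with nulls:
info.join(band_instruments, on = 'name', how = 'right')\
    .show()

# Expected shape of the result (sketch): John and Paul are matched,
# while Keith gets NULL in the `band`, `born` and `children` columns,
# and Mick, George and Ringo are dropped.
\end{verbatim}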
These two filtering types perform the matching process, but they only filter the rows of table A (i.e.~the DataFrame on the left side of the JOIN) based on the matches found in table B (i.e.~the DataFrame on the right side of the JOIN).

In other words, both of these types are used as a filter mechanism, not as a merge mechanism. When you use them, instead of merging two DataFrames together, you are interested in filtering the rows of DataFrame A based on the existence of those rows in DataFrame B.

This is different from what we learned about the \emph{left}, \emph{right}, \emph{full} and \emph{inner} types, because those do not only change which rows are included in the final result, they also add the columns from table B into table A. Because of this behavior, these four main types are usually called ``additive types'' of JOIN, since they are always adding data from table B into table A, i.e.~they are merging the two tables together.

In more detail, an \emph{anti join} performs the exact opposite selection of an \emph{inner join}. This means that an \emph{anti join} results in a new DataFrame that contains solely the observations of the left DataFrame that were not found in the right DataFrame. In other words, the observations that are found in both tables are removed from the resulting DataFrame of the JOIN. If we look at the example below, we can see that both John and Paul were removed from the result of the \emph{anti join}, because these two musicians are present in both DataFrames:

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{info.join(band\_instruments, on }\OperatorTok{=} \StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, how }\OperatorTok{=} \StringTok{\textquotesingle{}anti\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
\NormalTok{    .show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+------+--------------+----------+--------+
|  name|          band|      born|children|
+------+--------------+----------+--------+
|  Mick|Rolling Stones|1943-07-26|    true|
|George|       Beatles|1943-02-25|    true|
| Ringo|       Beatles|1940-07-07|    true|
+------+--------------+----------+--------+
\end{verbatim}

In contrast, a \emph{semi join} is equivalent to an \emph{inner join}, with the difference that it does not add the columns from table B into table A. So this type of JOIN keeps the rows of DataFrame A that also exist in DataFrame B. If an observation is found in both tables, it will appear in the resulting DataFrame.

\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{info.join(band\_instruments, on }\OperatorTok{=} \StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, how }\OperatorTok{=} \StringTok{\textquotesingle{}semi\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
\NormalTok{    .show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+----+-------+----------+--------+
|name|   band|      born|children|
+----+-------+----------+--------+
|John|Beatles|1940-09-10|    true|
|Paul|Beatles|1942-06-18|    true|
+----+-------+----------+--------+
\end{verbatim}

Just to keep using our visual model of sets, in Figure~\ref{fig-join-sets2} you can see the \emph{semi} and \emph{anti} JOIN types represented as sets.
\begin{figure}

\centering{

\includegraphics{Chapters/./../Figures/join-sets2.png}

}

\caption{\label{fig-join-sets2}The two ``filter types'' of JOIN}

\end{figure}%

\subsection{A cross JOIN as the seventh
type}\label{a-cross-join-as-the-seventh-type}

We described six different types of JOIN in the previous sections. But Spark also offers a seventh type of JOIN, called the \emph{cross join}. This is a special type of JOIN that you can use by calling the \texttt{crossJoin()} DataFrame method.

In essence, a \emph{cross join} returns, as output, the cartesian product of two DataFrames. It is similar to the R functions \href{https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/expand.grid}{\texttt{base::expand.grid()}}\footnote{\url{https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/expand.grid}} and \href{https://tidyr.tidyverse.org/reference/expand.html}{\texttt{tidyr::expand()}}\footnote{\url{https://tidyr.tidyverse.org/reference/expand.html}}, and also to the Python equivalent \href{https://docs.python.org/2/library/itertools.html\#itertools.product}{\texttt{itertools.product()}}\footnote{\url{https://docs.python.org/2/library/itertools.html\#itertools.product}}.

This is a type of JOIN that you should avoid using, especially if one (or both) of the DataFrames involved is a big DataFrame with thousands or millions of rows, because a \emph{cross join} always produces the full cartesian product of the two DataFrames involved. This means that, if DataFrame A contains \(x\) rows, and DataFrame B contains \(y\) rows, the end result of the \emph{cross join} is a new DataFrame C that contains \(x \times y\) rows.

In other words, the number of rows in the output of a \emph{cross join} grows multiplicatively with the sizes of the inputs. For example, a \emph{cross join} between a DataFrame of 1 thousand rows and another DataFrame of 10 thousand rows (both small DataFrames by the standards of a real-world big data environment) would produce a DataFrame with \(10^3 \times 10^4 = 10^7\) rows, that is, 10 million rows, as output.

In a big data environment, dealing with something that grows this fast is rarely a good idea. So try to avoid a \emph{cross join}, and use it solely on very small DataFrames.
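If you are unsure whether a \emph{cross join} is safe to run, a quick sanity check is to multiply the row counts of the two inputs first. The snippet below is a small, illustrative sketch of that idea (the \texttt{max\_rows} threshold is an arbitrary value chosen only for the example):

\begin{verbatim}
# Estimate the size of a cross join before running it.
# `max_rows` is an arbitrary threshold used only for illustration:
max_rows = 1_000_000

n_left = info.count()
n_right = band_instruments.count()
expected_rows = n_left * n_right

if expected_rows <= max_rows:
    result = info.crossJoin(band_instruments)
else:
    raise ValueError(
        f"Cross join would produce {expected_rows} rows; "
        "refusing to run it."
    )
\end{verbatim}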
- -As an example, to apply a \emph{cross join} between \texttt{info} and -\texttt{band\_instruments} DataFrames we can use the -\texttt{crossJoin()} method, like in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{info.crossJoin(band\_instruments)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - - -\begin{verbatim} -+------+--------------+----------+--------+-----+------+ -| name| band| born|children| name| plays| -+------+--------------+----------+--------+-----+------+ -| Mick|Rolling Stones|1943-07-26| true| John|guitar| -| Mick|Rolling Stones|1943-07-26| true| Paul| bass| -| Mick|Rolling Stones|1943-07-26| true|Keith|guitar| -| John| Beatles|1940-09-10| true| John|guitar| -| John| Beatles|1940-09-10| true| Paul| bass| -| John| Beatles|1940-09-10| true|Keith|guitar| -| Paul| Beatles|1942-06-18| true| John|guitar| -| Paul| Beatles|1942-06-18| true| Paul| bass| -| Paul| Beatles|1942-06-18| true|Keith|guitar| -|George| Beatles|1943-02-25| true| John|guitar| -|George| Beatles|1943-02-25| true| Paul| bass| -|George| Beatles|1943-02-25| true|Keith|guitar| -| Ringo| Beatles|1940-07-07| true| John|guitar| -| Ringo| Beatles|1940-07-07| true| Paul| bass| -| Ringo| Beatles|1940-07-07| true|Keith|guitar| -+------+--------------+----------+--------+-----+------+ -\end{verbatim} - -A \emph{cross join} is a special type of JOIN because it does not use -``keys'' and a matching process. It just computes every possible -combination between the rows from both DataFrames. Because of the -absence of these keys characteristics of a JOIN, many data analysts and -engineers would not call a \emph{cross join} as a type of JOIN (in other -words, they would call it a type of something else). But regardless of -our opinions, Spark decided to call this process as the \emph{cross -join}, so this is the way we are calling this process on this book. - -\section{Pivot operations}\label{pivot-operations} - -Pivot operations are extremely useful, and they are probably the main -operation you can use to completely reformat your table. What these -operations do is basically change the dimensions of your table. In other -words, this kind of operation transform columns into rows, or vice -versa. - -As a comparison with other data frameworks, a pivot operation in Spark -is the same operation performed by R functions -\href{https://tidyr.tidyverse.org/reference/pivot_longer.html}{\texttt{tidyr::pivot\_longer()}}\footnote{\url{https://tidyr.tidyverse.org/reference/pivot_longer.html}} -and -\href{https://tidyr.tidyverse.org/reference/pivot_wider.html}{\texttt{tidyr::pivot\_wider()}}\footnote{\url{https://tidyr.tidyverse.org/reference/pivot_wider.html}} -from the famous R framework \texttt{tidyverse}; or, the same as the -\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot.html}{\texttt{pivot()}}\footnote{\url{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot.html}} -and -\href{https://pandas.pydata.org/docs/reference/api/pandas.melt.html}{\texttt{melt()}}\footnote{\url{https://pandas.pydata.org/docs/reference/api/pandas.melt.html}} -methods from the Python framework \texttt{pandas}. - -In Spark, pivot operations are performed by the \texttt{pivot()} -DataFrame method, and by the \texttt{stack()} Spark SQL function. Pivot -transformations are available in both directions. That is, you can -transform either rows into columns (corresponds to \texttt{pivot()}), -or, columns into rows (corresponds to \texttt{stack()}). 
Let's begin -with \texttt{stack()}, and after that, we explain the \texttt{pivot()} -method. - -\subsection{Transforming columns into -rows}\label{transforming-columns-into-rows} - -The \texttt{stack()} Spark SQL function allows you to transform columns -into rows. In other words, you can make your DataFrame ``longer'' with -this kind of operation, because you remove columns (``width'') from the -table, and adds new rows (``heigth''). This gives an aspect of -``longer'' to your table, because after this operation, you table -usually have more rows than columns. - -As a first example, lets use the \texttt{religion} DataFrame, which you -can re-create in your session with the source code below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\StringTok{\textquotesingle{}Agnostic\textquotesingle{}}\NormalTok{, }\DecValTok{27}\NormalTok{, }\DecValTok{34}\NormalTok{, }\DecValTok{60}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}Atheist\textquotesingle{}}\NormalTok{, }\DecValTok{12}\NormalTok{, }\DecValTok{27}\NormalTok{, }\DecValTok{37}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}Buddhist\textquotesingle{}}\NormalTok{, }\DecValTok{27}\NormalTok{, }\DecValTok{21}\NormalTok{, }\DecValTok{30}\NormalTok{)} -\NormalTok{]} -\NormalTok{cols }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}religion\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}\textless{}$10k\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}$10k{-}$20k\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}$20k{-}$30k\textquotesingle{}}\NormalTok{]} -\NormalTok{religion }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, cols)} -\NormalTok{religion.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------+-----+---------+---------+ -|religion|<$10k|$10k-$20k|$20k-$30k| -+--------+-----+---------+---------+ -|Agnostic| 27| 34| 60| -| Atheist| 12| 27| 37| -|Buddhist| 27| 21| 30| -+--------+-----+---------+---------+ -\end{verbatim} - -This DataFrame is showing us the average salary of people belonging to -different religious groups. In each column of this DataFrame, you have -data for a specific salary level (or range). This is a structure that -can be easy and intuitive for some specific operations, but it also -might impose some limitations, specially if you need to apply a -vectorised operation over these salary ranges. - -The basic unit of this DataFrame are the religious groups, and the -salary ranges represents a characteristic of these groups. The different -salary ranges are distributed across different columns. But what if we -transformed these multiple columns into multiple rows? How can we -accomplish that? - -What we need to do, is to concentrate the labels (or the column names) -of salary ranges into a single column, and move the respective values of -the salary levels into another column. In other words, we need to create -a column that contains the labels, and another column that contains the -values. Figure~\ref{fig-pivot1} have a visual representation of this -process: - -\begin{figure} - -\centering{ - -\includegraphics{Chapters/./../Figures/pivot1.png} - -} - -\caption{\label{fig-pivot1}A visual representation of a pivot operation} - -\end{figure}% - -Let's build this transformation in \texttt{pyspark}. First, remember -that \texttt{stack()} is not a DataFrame method. It is a Spark SQL -function. 
However, it is not an exported Spark SQL function, which means -that you cannot import this function from the -\texttt{pyspark.sql.function} module. This means that \texttt{stack()} -will never be directly available in your python session to use. - -So how do you use it? The answer is: use it inside Spark SQL! The -\texttt{stack()} function is not available directly in python, but it is -always available in Spark SQL, so all you need to do, is to use -\texttt{stack()} inside functions and methods such as \texttt{expr()} -(that I introduced at Section~\ref{sec-sql-expr}), or \texttt{sql()} to -access Spark SQL functionality. - -Now, the \texttt{stack()} function have two main arguments, which are -the number of columns to transform into rows, and a sequence of -key-value pairs that describes which columns will be transformed into -rows, and the label values that corresponds to each column being -transformed. - -As a first example, the source code below replicates the transformation -exposed at Figure~\ref{fig-pivot1}: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ expr} -\NormalTok{stack\_expr }\OperatorTok{=} \StringTok{"""} -\StringTok{stack(3,} -\StringTok{ \textquotesingle{}\textless{}$10k\textquotesingle{}, \textasciigrave{}\textless{}$10k\textasciigrave{},} -\StringTok{ \textquotesingle{}$10k{-}$20k\textquotesingle{}, \textasciigrave{}$10k{-}$20k\textasciigrave{},} -\StringTok{ \textquotesingle{}$20k{-}$30k\textquotesingle{}, \textasciigrave{}$20k{-}$30k\textasciigrave{}} -\StringTok{) AS (salary\_range, avg\_salary)} -\StringTok{"""} - -\NormalTok{longer\_religion }\OperatorTok{=}\NormalTok{ religion}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}religion\textquotesingle{}}\NormalTok{, expr(stack\_expr))} - -\NormalTok{longer\_religion.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------+------------+----------+ -|religion|salary_range|avg_salary| -+--------+------------+----------+ -|Agnostic| <$10k| 27| -|Agnostic| $10k-$20k| 34| -|Agnostic| $20k-$30k| 60| -| Atheist| <$10k| 12| -| Atheist| $10k-$20k| 27| -| Atheist| $20k-$30k| 37| -|Buddhist| <$10k| 27| -|Buddhist| $10k-$20k| 21| -|Buddhist| $20k-$30k| 30| -+--------+------------+----------+ -\end{verbatim} - -An important aspect about the \texttt{stack()} function, is that it -always outputs two new columns (one column for the labels - or the keys, -and another for the values). In the example above, these new columns are -\texttt{salary\_range} and \texttt{avg\_salary}. - -The first column identifies from which column (before the -\texttt{stack()} operation) the value present at \texttt{avg\_salary} -came from. This means that this first column produced by -\texttt{stack()} works as a column of labels or identifiers. These -labels identify from which of the three transformed columns -(\texttt{\textless{}\$10k}, \texttt{\$10k-\$20k} and -\texttt{\$20k-\$30k}) the row value came from. In the visual -representation exposed at Figure~\ref{fig-pivot1}, this ``labels -column'' is the \texttt{income} column. - -On the other hand, the second column in the output of \texttt{stack()} -contains the actual values that were present on the columns that were -transformed. This ``values column'' in the example above corresponds to -the column \texttt{avg\_salary}, while in the visual representation -exposed at Figure~\ref{fig-pivot1}, it is the \texttt{values} column. 
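If you prefer the \texttt{sql()} route mentioned above, the same
transformation can be written as a plain SQL query. The snippet below is
a minimal sketch of this alternative: it assumes the \texttt{religion}
DataFrame created earlier is still available in your session, and it
registers this DataFrame as a temporary view named
\texttt{religion\_view} (a name chosen just for this example) before
calling \texttt{stack()} inside a \texttt{SELECT} statement:

\begin{verbatim}
# A sketch of the same stack() operation, written through spark.sql().
# The view name `religion_view` is arbitrary and used only here.
religion.createOrReplaceTempView('religion_view')

longer_religion_sql = spark.sql('''
    SELECT religion, stack(3,
        '<$10k', `<$10k`,
        '$10k-$20k', `$10k-$20k`,
        '$20k-$30k', `$20k-$30k`
    ) AS (salary_range, avg_salary)
    FROM religion_view
''')

longer_religion_sql.show()
\end{verbatim}

The result should be the same DataFrame we just built with
\texttt{expr()}. A third option that should also work is the
\texttt{selectExpr()} method, which accepts expression strings directly,
so you could pass the \texttt{stack\_expr} string from the previous
example to it without calling \texttt{expr()} at all.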
- -The first argument in \texttt{stack()} is always the number of columns -that will be transformed by the function into rows. In our example, we -have three columns that we want to transform, which are -\texttt{\textless{}\$10k}, \texttt{\$10k-\$20k} and -\texttt{\$20k-\$30k}. That is why we have the number 3 as the first -argument to \texttt{stack()}. - -After that, we have a sequence of key-value pairs. In each pair, the -value side (i.e.~the right side) of the pair contains the name of the -column that will be transformed, and the key side (i.e.~the left side) -of the pair contains the ``label value'', or, in other words, which -value represents, marks, label, or identifies the values that came from -the column described in the right side of the pair. - -Normally, you set the label value to be equivalent to the column name. -That is, both sides of each pair are usually pretty much the same. But -you can change this behaviour if you want. In the example below, all -values that came from the \texttt{\textless{}\$10k} are labeled as -\texttt{"Below\ \$10k"}, while the values from the \texttt{\$10k-\$20k} -column, are labeled in the output as \texttt{"Between\ \$10k-\$20k"}, -etc. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{stack\_expr }\OperatorTok{=} \StringTok{"""} -\StringTok{stack(3,} -\StringTok{ \textquotesingle{}Below $10k\textquotesingle{}, \textasciigrave{}\textless{}$10k\textasciigrave{},} -\StringTok{ \textquotesingle{}Between $10k{-}$20k\textquotesingle{}, \textasciigrave{}$10k{-}$20k\textasciigrave{},} -\StringTok{ \textquotesingle{}Between $20k{-}$30k\textquotesingle{}, \textasciigrave{}$20k{-}$30k\textasciigrave{}} -\StringTok{) AS (salary\_range, avg\_salary)} -\StringTok{"""} - -\NormalTok{religion}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}religion\textquotesingle{}}\NormalTok{, expr(stack\_expr))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------+-----------------+----------+ -|religion| salary_range|avg_salary| -+--------+-----------------+----------+ -|Agnostic| Below $10k| 27| -|Agnostic|Between $10k-$20k| 34| -|Agnostic|Between $20k-$30k| 60| -| Atheist| Below $10k| 12| -| Atheist|Between $10k-$20k| 27| -| Atheist|Between $20k-$30k| 37| -|Buddhist| Below $10k| 27| -|Buddhist|Between $10k-$20k| 21| -|Buddhist|Between $20k-$30k| 30| -+--------+-----------------+----------+ -\end{verbatim} - -Furthermore, because the \texttt{stack()} function always outputs two -new columns, if you want to rename these two new columns being created, -to give them more readable and meaningful names, you always need to -provide two new column names at once, inside a tuple, to the \texttt{AS} -keyword. - -In the example above, this tuple is -\texttt{(salary\_range,\ avg\_salary)}. The first value in the tuple is -the new name for the ``labels column'', while the second value in the -tuple, is the new name for the ``values column''. - -Now, differently from other Spark SQL functions, the \texttt{stack()} -function should not be used inside the \texttt{withColumn()} method, and -the reason for this is very simple: \texttt{stack()} always returns two -new columns as output, but the \texttt{withColumn()} method can only -create one column at a time. 
- -This is why you get an \texttt{AnalysisException} error when you try to -use \texttt{stack()} inside \texttt{withColumn()}, like in the example -below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{religion}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}salary\_ranges\textquotesingle{}}\NormalTok{, stack\_expr)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -AnalysisException: The number of aliases supplied in the AS claus -e -does not match the number of columns output by the UDTF expected -2 aliases but got salary_ranges -\end{verbatim} - -\subsection{Transforming rows into -columns}\label{transforming-rows-into-columns} - -On the other side, if you want to transform rows into columns in your -Spark DataFrame, you can use the \texttt{pivot()} method. One key aspect -of the \texttt{pivot()} method, is that it must always be used in -conjunction with the \texttt{groupby()} method that we introduced at -Section~\ref{sec-group-by}. In other words, \texttt{pivot()} does not -work without \texttt{groupby()}. - -You can think (or interpret) that the \texttt{groupby()} method does the -job of defining (or identifying) which columns will be present in the -output of \texttt{pivot()}. For example, if your DataFrame contains five -columns, which are \texttt{A}, \texttt{B}, \texttt{C}, \texttt{D} and -\texttt{E}; but you only listed columns \texttt{A} and \texttt{C} inside -\texttt{groupby()}, this means that if you perform a pivot operation -after that, the columns \texttt{B}, \texttt{D} and \texttt{E} will not -be present in the output of \texttt{pivot()}. These three columns -(\texttt{B}, \texttt{D} and \texttt{E}) will be automatically dropped -during the pivot operation. - -In contrast, the \texttt{pivot()} method does the job of identifying a -single column containing the values that will be transformed into new -columns. In other words, if you list the column \texttt{car\_brands} -inside \texttt{pivot()}, and, this column contains four unique values, -for example, \texttt{Audi}, \texttt{BMW}, \texttt{Jeep} and -\texttt{Fiat}, this means that, in the output of \texttt{pivot()}, four -new columns will be created, named as \texttt{Audi}, \texttt{BMW}, -\texttt{Jeep} and \texttt{Fiat}. - -Therefore, we use \texttt{groupby()} to define the columns that will be -kept intact on the output of the pivot operation; we use -\texttt{pivot()} to mark the column that contains the rows that we want -to transform into new columns; at last, we must learn how to define -which values will populate the new columns that will be created. Not -only that, we also need to specify how these values will be calculated. -And for that, we need to use an aggregating function. - -This is really important, you can not do a pivot operation without -aggregating the values that will compose (or populate) the new columns -you are creating. Without it, Spark will not let you do the pivot -operation using the \texttt{pivot()} method. - -As a first example, let's return to the \texttt{religion} DataFrame. -More specifically, to the \texttt{longer\_religion} DataFrame, which is -the pivoted version of the \texttt{religion} DataFrame that we created -on the previous section, using the \texttt{stack()} function. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{longer\_religion.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------+------------+----------+ -|religion|salary_range|avg_salary| -+--------+------------+----------+ -|Agnostic| <$10k| 27| -|Agnostic| $10k-$20k| 34| -|Agnostic| $20k-$30k| 60| -| Atheist| <$10k| 12| -| Atheist| $10k-$20k| 27| -| Atheist| $20k-$30k| 37| -|Buddhist| <$10k| 27| -|Buddhist| $10k-$20k| 21| -|Buddhist| $20k-$30k| 30| -+--------+------------+----------+ -\end{verbatim} - -We can use this \texttt{longer\_religion} DataFrame and the -\texttt{pivot()} method to perform the inverse operation we described at -Figure~\ref{fig-pivot1}. In other words, we can re-create the -\texttt{religion} DataFrame through \texttt{longer\_religion} using the -\texttt{pivot()} method. The source code below demonstrates how we could -do such thing: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ first} - -\CommentTok{\# Equivalent to the \textasciigrave{}religion\textasciigrave{} DataFrame:} -\NormalTok{longer\_religion}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupby(}\StringTok{\textquotesingle{}religion\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .pivot(}\StringTok{\textquotesingle{}salary\_range\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(first(}\StringTok{\textquotesingle{}avg\_salary\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------+---------+---------+-----+ -|religion|$10k-$20k|$20k-$30k|<$10k| -+--------+---------+---------+-----+ -|Agnostic| 34| 60| 27| -|Buddhist| 21| 30| 27| -| Atheist| 27| 37| 12| -+--------+---------+---------+-----+ -\end{verbatim} - -In the example above, you can see the three core parts that we -described: 1) use \texttt{groupby()} to define which columns will be -preserved from the input DataFrame; 2) use \texttt{pivot()} to define -which column will be used to transform rows into new columns; 3) the -aggregating functions describing which values will be used, and how they -are going to be calculated - -\texttt{agg(first(\textquotesingle{}avg\_salary\textquotesingle{}))}. -Figure~\ref{fig-pivot2} exposes these core parts in a visual manner. - -\begin{figure} - -\centering{ - -\includegraphics{Chapters/./../Figures/pivot2.png} - -} - -\caption{\label{fig-pivot2}The three core parts that define the use of -\texttt{pivot()}} - -\end{figure}% - -When you use \texttt{pivot()}, you have to apply an aggregating function -over the column from which you want to extract the values that will -populate the new columns created from the pivot operation. This is a -prerequisite, because Spark needs to know what he must do in case he -finds two (or more) values that are mapped to the same cell in the pivot -operation. - -In the above example, we used the \texttt{first()} function to aggregate -the values from the \texttt{avg\_salary} column. With this function, we -are telling Spark, that if it finds two (or more) values that are mapped -the same cell, then, Spark should pick the \emph{first value if finds} -in the input DataFrame, and simply ignore the remaining values. - -Let's see an example. 
In the code chunk below, we are creating a new -DataFrame called \texttt{df}: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\StringTok{\textquotesingle{}2023{-}05{-}01\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Anne\textquotesingle{}}\NormalTok{, }\DecValTok{1}\NormalTok{, }\DecValTok{15}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}2023{-}05{-}02\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Mike\textquotesingle{}}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{25}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}2023{-}05{-}02\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Mike\textquotesingle{}}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{34}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}2023{-}05{-}02\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Mike\textquotesingle{}}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{21}\NormalTok{),} -\NormalTok{ (}\StringTok{\textquotesingle{}2023{-}05{-}03\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}Dani\textquotesingle{}}\NormalTok{, }\DecValTok{3}\NormalTok{, }\DecValTok{18}\NormalTok{)} -\NormalTok{]} -\NormalTok{cols }\OperatorTok{=}\NormalTok{ [}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}id\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{]} -\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, cols)} -\NormalTok{df.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+----+---+-----+ -| date|name| id|value| -+----------+----+---+-----+ -|2023-05-01|Anne| 1| 15| -|2023-05-02|Mike| 2| 25| -|2023-05-02|Mike| 2| 34| -|2023-05-02|Mike| 2| 21| -|2023-05-03|Dani| 3| 18| -+----------+----+---+-----+ -\end{verbatim} - -Let's suppose you want to transform the rows in the \texttt{name} column -into new columns, and, populate these new columns with the values from -the \texttt{value} column. Following what we discussed until now, we -could do this by using the following source code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupby(}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}id\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .pivot(}\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(first(}\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+---+----+----+----+ -| date| id|Anne|Dani|Mike| -+----------+---+----+----+----+ -|2023-05-02| 2|NULL|NULL| 25| -|2023-05-03| 3|NULL| 18|NULL| -|2023-05-01| 1| 15|NULL|NULL| -+----------+---+----+----+----+ -\end{verbatim} - -However, there is a problem with this operation, because we lost some -observations in the process. The specific combination -\texttt{(\textquotesingle{}2023-05-02\textquotesingle{},\ \textquotesingle{}Mike\textquotesingle{})} -have three different values in the \texttt{value} column, which are -\texttt{25}, \texttt{34} and \texttt{21}. But there is only a single -cell in the new DataFrame (i.e.~the output of the pivot operation) to -hold these values. 
More specifically, the cell located at the first row -in the fifth column. - -In other words, Spark found three different values for a single cell (or -single space), and this is always a problem. Spark cannot simply put -three different values in a single cell. A Spark DataFrame just do not -work that way. Every cell in a Spark DataFrame should always hold a -single value, whatever that value is. - -That is the exact problem that an aggregating function solves in a pivot -operation. The aggregating function does the job of ensuring that a -single value will be mapped to every new cell created from the pivot -operation. Because an aggregating function is a function that aggregates -(or that summarises) a set of values into a single value. - -In the example above we used \texttt{first()} as our aggregating -function. So when Spark encountered the three values from the -combination -\texttt{(\textquotesingle{}2023-05-02\textquotesingle{},\ \textquotesingle{}Mike\textquotesingle{})}, -it simply picked the first value from the three. That is why we find the -value \texttt{25} at the first row in the fifth column. - -We can change this behaviour by changing the aggregating function -applied. In the example below, we are using \texttt{sum()}, and, as a -result, we get now the value of \texttt{80} (which is the sum of values -\texttt{25}, \texttt{34} and \texttt{21}) at the first row in the fifth -column. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import} \BuiltInTok{sum} -\NormalTok{df}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupby(}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}id\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .pivot(}\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(}\BuiltInTok{sum}\NormalTok{(}\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+---+----+----+----+ -| date| id|Anne|Dani|Mike| -+----------+---+----+----+----+ -|2023-05-02| 2|NULL|NULL| 80| -|2023-05-03| 3|NULL| 18|NULL| -|2023-05-01| 1| 15|NULL|NULL| -+----------+---+----+----+----+ -\end{verbatim} - -Now, depending on the data and the structure from your DataFrame, Spark -might never encounter a situation where it finds two (or more) values -mapped to the same cell in a pivot operation. On this specific case, the -output from the pivot operation will likely be the same for many -different aggregating functions. In other words, the choice of the -aggregating function you want to use might be irrelevant over this -specific situation. - -For example, in the \texttt{longer\_religion} example that we showed -before, I could use the \texttt{last()} aggregating function (instead of -\texttt{first()}), and get the exact same result as before. Because in -this specific situation, Spark does not find any case of two (or more) -values mapped to the same cell in the output DataFrame. 
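The \texttt{last()} version is shown right after this aside. Before we
get to it, one more detail about \texttt{pivot()} that can be useful: it
accepts an optional second argument, which is a list with the values you
want to turn into new columns. The snippet below is a minimal sketch of
this idea, reusing the \texttt{longer\_religion} DataFrame; the list of
salary ranges is typed by hand just for this example:

\begin{verbatim}
# Passing the distinct values explicitly to pivot().
# This fixes the order of the new columns, and it also spares Spark
# from scanning the salary_range column to discover these values.
salary_levels = ['<$10k', '$10k-$20k', '$20k-$30k']

longer_religion\
    .groupby('religion')\
    .pivot('salary_range', salary_levels)\
    .agg(first('avg_salary'))\
    .show()
\end{verbatim}

The output should match the previous result, except that the new columns
appear in the order given by the list.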
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ last} -\NormalTok{longer\_religion}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupby(}\StringTok{\textquotesingle{}religion\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .pivot(}\StringTok{\textquotesingle{}salary\_range\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(last(}\StringTok{\textquotesingle{}avg\_salary\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------+---------+---------+-----+ -|religion|$10k-$20k|$20k-$30k|<$10k| -+--------+---------+---------+-----+ -|Agnostic| 34| 60| 27| -|Buddhist| 21| 30| 27| -| Atheist| 27| 37| 12| -+--------+---------+---------+-----+ -\end{verbatim} - -\section{Collecting and explode -operations}\label{collecting-and-explode-operations} - -You can retract or extend vertically your DataFrame, by nesting multiple -rows into a single row, or, the inverse, which is unnesting (or -exploding) one single row into multiple rows. The R \texttt{tidyverse} -framework is probably the only data framework that have a defined name -for this kind of operation, which are called ``nesting'' and -``unnesting''. - -In Spark, you perform this kind of operations by using the -\texttt{collect\_list()}, \texttt{collect\_set()} and \texttt{explode()} -functions, which all comes from the \texttt{pyspark.sql.functions} -module. The \texttt{collect\_list()} and \texttt{collect\_set()} -functions are used for retracting (or nesting) your DataFrame, while the -\texttt{explode()} function is used for extending (or unnesting). - -As a quick comparison, \texttt{explode()} is very similar to the R -function -\href{https://tidyr.tidyverse.org/reference/unnest_longer.html}{\texttt{tidyr::-unnest\_longer()}}\footnote{\url{https://tidyr.tidyverse.org/reference/unnest_longer.html}} -from the \texttt{tidyverse} framework, and also, similar to the Python -method -\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.explode.html}{\texttt{explode()}}\footnote{\url{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.explode.html}} -in the \texttt{pandas} framework. In the other hand, -\texttt{collect\_list()} and \texttt{collect\_set()} functions does a -similar job to the R function -\href{https://tidyr.tidyverse.org/reference/nest.html}{\texttt{tidyr::nest()}}\footnote{\url{https://tidyr.tidyverse.org/reference/nest.html}}. - -\subsection{\texorpdfstring{Expanding (or unnesting) with -\texttt{explode()}}{Expanding (or unnesting) with explode()}}\label{expanding-or-unnesting-with-explode} - -As an example, we have the \texttt{employees} DataFrame below. Each row -in this DataFrame describes an employee. The \texttt{knowledge} column -describes which programming language the employee have experience with, -and, the \texttt{employee\_attrs} column contains dictionaries with -general attributes of each employee (such it's department and it's -name). 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ (}\DecValTok{1}\NormalTok{, [}\StringTok{"R"}\NormalTok{, }\StringTok{"Python"}\NormalTok{], \{}\StringTok{\textquotesingle{}dep\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}PR\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}Anne\textquotesingle{}}\NormalTok{\}),} -\NormalTok{ (}\DecValTok{2}\NormalTok{, [}\StringTok{"Scala"}\NormalTok{], \{}\StringTok{\textquotesingle{}dep\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}PM\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}Mike\textquotesingle{}}\NormalTok{\}),} -\NormalTok{ (}\DecValTok{3}\NormalTok{, [}\StringTok{"Java"}\NormalTok{, }\StringTok{"Python"}\NormalTok{], \{}\StringTok{\textquotesingle{}dep\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}HF\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}name\textquotesingle{}}\NormalTok{: }\StringTok{\textquotesingle{}Sam\textquotesingle{}}\NormalTok{\})} -\NormalTok{]} -\NormalTok{columns }\OperatorTok{=}\NormalTok{ [}\StringTok{"employee\_id"}\NormalTok{, }\StringTok{"knowledge"}\NormalTok{, }\StringTok{"employee\_attrs"}\NormalTok{]} -\NormalTok{employees }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, columns)} -\NormalTok{employees.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----------+--------------+--------------------+ -|employee_id| knowledge| employee_attrs| -+-----------+--------------+--------------------+ -| 1| [R, Python]|{name -> Anne, de...| -| 2| [Scala]|{name -> Mike, de...| -| 3|[Java, Python]|{name -> Sam, dep...| -+-----------+--------------+--------------------+ -\end{verbatim} - -You can use the \texttt{printSchema()} method that we introduced at -Section~\ref{sec-dataframe-schema} to see the schema of the DataFrame. -You can see in the result below that \texttt{knowledge} is a column of -arrays of strings (i.e.~\texttt{ArrayType}), while -\texttt{employee\_attrs} is a column of maps (i.e.~\texttt{MapType}). - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{employees.printSchema()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -root - |-- employee_id: long (nullable = true) - |-- knowledge: array (nullable = true) - | |-- element: string (containsNull = true) - |-- employee_attrs: map (nullable = true) - | |-- key: string - | |-- value: string (valueContainsNull = true) -\end{verbatim} - -Suppose you wanted to calculate the number of employees that have -experience in each programming language. To do that, it would be great -to transform the \texttt{knowledge} column into a column of strings. -Why? Because it would make the counting of the employees easier. - -In other words, if an employee knows, for example, 3 different -programming languages, it would be better to have 3 different rows that -references this same employee, instead of having a single row that -contains an array of three elements, like in the \texttt{employees} -DataFrame. We can transform the current DataFrame into this new format -by using the \texttt{explode()} function. - -When you apply the \texttt{explode()} function over a column of arrays, -this function will create a new row for each element in each array it -finds in the column. For example, the employee of ID 1 knows two -programming languages (R and Python). 
As a result, when we apply -\texttt{explode()} over the \texttt{knowledge} column, two rows are -created for the employee of ID 1. As you can see in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ explode} -\NormalTok{explode\_array }\OperatorTok{=}\NormalTok{ employees}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}employee\_id\textquotesingle{}}\NormalTok{,} -\NormalTok{ explode(}\StringTok{\textquotesingle{}knowledge\textquotesingle{}}\NormalTok{)} -\NormalTok{ )} - -\NormalTok{explode\_array.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----------+------+ -|employee_id| col| -+-----------+------+ -| 1| R| -| 1|Python| -| 2| Scala| -| 3| Java| -| 3|Python| -+-----------+------+ -\end{verbatim} - -On the other hand, instead of arrays, the behaviour of -\texttt{explode()} is slightly different when you apply it over a column -of maps, such as \texttt{employee\_attrs}. Because each element in a map -have two components: a key and a value. As a consequence, when you apply -\texttt{explode()} over a column of maps, each element in the map -generates two different rows, which are stored in two separated columns, -called \texttt{key} and \texttt{value}. - -Take the result below as an example. First, we have two new columns that -were not present before (\texttt{key} and \texttt{value}). Each row in -the \texttt{key} column represents the key for an element in the input -map. While the \texttt{value} column represents the values of those -elements. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{explode\_map }\OperatorTok{=}\NormalTok{ employees}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}employee\_id\textquotesingle{}}\NormalTok{,} -\NormalTok{ explode(}\StringTok{\textquotesingle{}employee\_attrs\textquotesingle{}}\NormalTok{)} -\NormalTok{ )} - -\NormalTok{explode\_map.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----------+----+-----+ -|employee_id| key|value| -+-----------+----+-----+ -| 1|name| Anne| -| 1| dep| PR| -| 2|name| Mike| -| 2| dep| PM| -| 3|name| Sam| -| 3| dep| HF| -+-----------+----+-----+ -\end{verbatim} - -This kind of output is powerful, specially with pivot operations, -because you can easily organize all the data found in a column of maps -into a series of new columns, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{explode\_map}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupby(}\StringTok{\textquotesingle{}employee\_id\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .pivot(}\StringTok{\textquotesingle{}key\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(first(}\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----------+---+----+ -|employee_id|dep|name| -+-----------+---+----+ -| 1| PR|Anne| -| 2| PM|Mike| -| 3| HF| Sam| -+-----------+---+----+ -\end{verbatim} - -\subsection{\texorpdfstring{The different versions of -\texttt{explode()}}{The different versions of explode()}}\label{the-different-versions-of-explode} - -The \texttt{explode()} function have three brother functions, which are -\texttt{explode\_outer()}, \texttt{posexplode()} and -\texttt{posexplode\_outer()}. All of these functions have very small -differences between them. 
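To make the comparison easier to follow, the snippet below is a minimal
sketch that builds a small, hypothetical DataFrame (separate from the
\texttt{employees} example), in which one of the arrays is empty and
another is \texttt{null}, and then applies both \texttt{explode()} and
\texttt{explode\_outer()} over it:

\begin{verbatim}
from pyspark.sql.functions import explode, explode_outer

# A toy DataFrame, created just for this comparison.
toy = spark.createDataFrame(
    [(1, ['R', 'Python']), (2, []), (3, None)],
    'id LONG, langs ARRAY<STRING>'
)

# explode(): the entries of ids 2 and 3 produce no output rows,
# because their arrays are empty or null.
toy.select('id', explode('langs')).show()

# explode_outer(): ids 2 and 3 are kept in the output,
# with a null value in the exploded column.
toy.select('id', explode_outer('langs')).show()
\end{verbatim}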
However, these differences can be important, so let's go through them
one by one.

First, the difference between \texttt{explode()} and
\texttt{explode\_outer()} is in how they handle arrays (or maps) that
are \texttt{null} or empty. When \texttt{explode()} receives a
\texttt{null} or empty array, it produces no row at all for that entry,
so the corresponding row of the input DataFrame simply disappears from
the output. In contrast, \texttt{explode\_outer()} keeps that row, and
fills the exploded column with a \texttt{null} value.

For example, if one of the employees had a \texttt{null} (or an empty
array) in the \texttt{knowledge} column, then, with \texttt{explode()},
this employee would not appear in the output at all. With
\texttt{explode\_outer()}, the employee would still appear, with a
\texttt{null} value in the exploded column. On the other hand, a
\texttt{null} element stored inside a non-empty array is treated as an
ordinary value, and both functions create a row for it.

Second, the difference between \texttt{explode()} and
\texttt{posexplode()} is that \texttt{posexplode()} also returns the
index that identifies the position in the input array where each value
is. In other words, if we applied \texttt{posexplode()} over the
\texttt{knowledge} column of the \texttt{employees} DataFrame, we would
get a new column called \texttt{pos}. By looking at this column
\texttt{pos}, we could see that the value \texttt{"Python"} for the
employee of ID 1 was at the position of index 1 in the array stored in
the original \texttt{knowledge} column.

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ posexplode}
\NormalTok{employees}\OperatorTok{\textbackslash{}}
\NormalTok{    .select(}
        \StringTok{\textquotesingle{}employee\_id\textquotesingle{}}\NormalTok{,}
\NormalTok{        posexplode(}\StringTok{\textquotesingle{}knowledge\textquotesingle{}}\NormalTok{)}
\NormalTok{    )}\OperatorTok{\textbackslash{}}
\NormalTok{    .show()}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
+-----------+---+------+
|employee_id|pos|   col|
+-----------+---+------+
|          1|  0|     R|
|          1|  1|Python|
|          2|  0| Scala|
|          3|  0|  Java|
|          3|  1|Python|
+-----------+---+------+
\end{verbatim}

Third, as you can probably imagine, the \texttt{posexplode\_outer()}
function combines the two behaviours above. Not only does
\texttt{posexplode\_outer()} keep the rows whose arrays are
\texttt{null} or empty (filling the exploded columns with
\texttt{null}), but it also returns the index that identifies the
position in the input array where each value is.

\subsection{\texorpdfstring{Retracting (or nesting) with
\texttt{collect\_list()} and
\texttt{collect\_set()}}{Retracting (or nesting) with collect\_list() and collect\_set()}}\label{retracting-or-nesting-with-collect_list-and-collect_set}

In summary, you use the \texttt{collect\_list()} and
\texttt{collect\_set()} functions to retract your DataFrame. That is, to
reduce the number of rows of the DataFrame, while keeping the same
amount of information.

To do this, you just aggregate your DataFrame using the \texttt{agg()}
method, and apply the \texttt{collect\_list()} or
\texttt{collect\_set()} function over the columns you want to retract
(or nest). You likely want to use the \texttt{groupby()} method as well
in this case, to perform an aggregation per group.

That is because, if you do not define any group in this situation, you
will aggregate the entire DataFrame into a single value.
This means, that you -would get as output, a new DataFrame with only a single row, and, all -rows from the input DataFrame would be condensed inside this single row -in the output DataFrame. - -In essence, what the \texttt{collect\_list()} function do is collect a -set of rows and store it in an array. The \texttt{collect\_set()} -function does the same thing, with the difference that it stores this -set of rows in a set (which is an array of unique values). In other -words, \texttt{collect\_set()} collects a set of rows, then, it removes -all duplicated rows in this set, then, it stores the remaining values in -an array. - -To demonstrate the \texttt{collect\_list()} and \texttt{collect\_set()} -functions, we can use the \texttt{supermarket\_sales} DataFrame that we -introduced at Section~\ref{sec-remove-duplicates} as an example: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{supermarket\_sales.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+------------+--------+-----+ -|transaction_id|product_name|quantity|price| -+--------------+------------+--------+-----+ -| T001| Apple| 5| 1.2| -| T001| Apple| 5| 1.2| -| T002| Banana| 3| 0.8| -| T004| Mango| 2| 2.0| -| T004| Mango| 2| 2.0| -| T004| Mango| 2| 2.0| -| T005| Grapes| 1| 3.5| -| T006| Apple| 2| 1.2| -| T006| Apple| 1| 1.2| -| T007| Banana| 4| 0.8| -| T008| Apple| 3| 1.2| -+--------------+------------+--------+-----+ -\end{verbatim} - -Suppose you wanted a new DataFrame that had a single row for each -\texttt{transaction\_id} without losing any amount of information from -the above DataFrame. We could do this by using the -\texttt{collect\_list()} function, like in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ collect\_list} -\NormalTok{supermarket\_sales}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupby(}\StringTok{\textquotesingle{}transaction\_id\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(} -\NormalTok{ collect\_list(}\StringTok{\textquotesingle{}product\_name\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}product\_name\textquotesingle{}}\NormalTok{),} -\NormalTok{ collect\_list(}\StringTok{\textquotesingle{}quantity\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}quantity\textquotesingle{}}\NormalTok{),} -\NormalTok{ collect\_list(}\StringTok{\textquotesingle{}price\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}price\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+--------------------+---------+---------------+ -|transaction_id| product_name| quantity| price| -+--------------+--------------------+---------+---------------+ -| T001| [Apple, Apple]| [5, 5]| [1.2, 1.2]| -| T002| [Banana]| [3]| [0.8]| -| T004|[Mango, Mango, Ma...|[2, 2, 2]|[2.0, 2.0, 2.0]| -| T005| [Grapes]| [1]| [3.5]| -| T006| [Apple, Apple]| [2, 1]| [1.2, 1.2]| -| T007| [Banana]| [4]| [0.8]| -| T008| [Apple]| [3]| [1.2]| -+--------------+--------------------+---------+---------------+ -\end{verbatim} - -The expression -\texttt{groupby(\textquotesingle{}transaction\_id\textquotesingle{})} -ensures that we have (on the output DataFrame) a single row for each -unique value in the \texttt{transaction\_id} column. 
While the -\texttt{collect\_list()} function does the job of condensing all the -information of the remaining columns into a single row for each -\texttt{transaction\_id}. - -Now, if we use \texttt{collect\_set()} instead of -\texttt{collect\_list()}, we would get a slightly different result. -Because, as we described before, the \texttt{collect\_set()} function -removes all duplicated values found in the set of rows it collects. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ collect\_set} -\NormalTok{supermarket\_sales}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupby(}\StringTok{\textquotesingle{}transaction\_id\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(} -\NormalTok{ collect\_set(}\StringTok{\textquotesingle{}product\_name\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}product\_name\textquotesingle{}}\NormalTok{),} -\NormalTok{ collect\_set(}\StringTok{\textquotesingle{}quantity\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}quantity\textquotesingle{}}\NormalTok{),} -\NormalTok{ collect\_set(}\StringTok{\textquotesingle{}price\textquotesingle{}}\NormalTok{).alias(}\StringTok{\textquotesingle{}price\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+------------+--------+-----+ -|transaction_id|product_name|quantity|price| -+--------------+------------+--------+-----+ -| T001| [Apple]| [5]|[1.2]| -| T002| [Banana]| [3]|[0.8]| -| T004| [Mango]| [2]|[2.0]| -| T005| [Grapes]| [1]|[3.5]| -| T006| [Apple]| [1, 2]|[1.2]| -| T007| [Banana]| [4]|[0.8]| -| T008| [Apple]| [3]|[1.2]| -+--------------+------------+--------+-----+ -\end{verbatim} - -\bookmarksetup{startatroot} - -\chapter{Exporting data out of Spark}\label{sec-export} - -After you transform your DataFrame and generate the results you want, -you might need to actually export these results out of Spark, so you -can: - -\begin{itemize} -\tightlist -\item - send the exported data to an external API. -\item - send these results to your manager or client. -\item - send the exported data to an ingest process that feeds some database. -\end{itemize} - -\section{\texorpdfstring{The \texttt{write} object as the main -entrypoint}{The write object as the main entrypoint}}\label{the-write-object-as-the-main-entrypoint} - -Every Spark session you start has an built-in \texttt{read} object that -you can use to read data and import it into Spark (this object was -described at Section~\ref{sec-read-files}), and the same applies to -writing data out of Spark. That is, Spark also offers a \texttt{write} -object that you can use to write/output data out of Spark. - -But in contrast to the \texttt{read} object, which is avaiable trough -the \texttt{SparkSession} object (\texttt{spark}), this \texttt{write} -object is available trough the \texttt{write} method of any -\texttt{DataFrame} object. In other words, every DataFrame you create in -Spark has a built-in \texttt{write} object that you can use to -write/export the data present in this DataFrame out of Spark. - -As an example, let's use the \texttt{transf} DataFrame that I presented -at Chapter~\ref{sec-transforming-dataframes-part1}. The \texttt{write} -method of the \texttt{transf} DataFrame object is the main entrypoint to -all the facilities that Spark offers to write/export \texttt{transf}'s -data to somewhere else. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.write} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} - -\end{verbatim} - -This \texttt{write} object is very similar in structure to the -\texttt{read} object. Essentially, this write object have a collection -of \emph{write engines}. Each write engine is speciallized in writing -data into a specific file format. So you have an engine for CSV files, -another engine for JSON files, another for Parquet files, etc. - -Every \texttt{write} object have the following methods: - -\begin{itemize} -\tightlist -\item - \texttt{mode()}: set the mode of the write process. This affects how - the data will be written to the files, and how the process will - behaviour if exceptions (or erros) are raised during runtime. -\item - \texttt{option()}: set an option to be used in the write process. This - option might be specific to the write engine used, or, might be an - option that is global to the write process (i.e.~an option that does - not depend of the chosen engine). -\item - \texttt{csv()}: the write engine to export data to CSV files. -\item - \texttt{json()}: the write engine to export data to JSON files. -\item - \texttt{parquet()}: the write engine to export data to Parquet files. -\item - \texttt{orc()}: the write engine to export data to ORC files. -\item - \texttt{text()}: the write engine to export data to text files. -\item - \texttt{jdbc()}: saves the data of the current DataFrame into a - database using the JDBC API. -\end{itemize} - -\section{\texorpdfstring{Exporting the \texttt{transf} -DataFrame}{Exporting the transf DataFrame}}\label{sec-write-example} - -As a first example on how to export data out of Spark, I will export the -data from the \texttt{transf} DataFrame. Over the next sections, I will -cover individual aspects that influences this write/export process. You -should know and consider each of these individual aspects when exporting -your data. - -\subsection{Quick export to a CSV -file}\label{quick-export-to-a-csv-file} - -Lets begin with a quick example of exporting the Spark data to a CSV -file. For this job, we need to use the write engine for CSV files, which -is the \texttt{csv()} method from the write object. - -The \textbf{first (and main) argument to all write engines} available in -Spark is a path to a folder where you want to store the exported files. -This means that (whatever write engine you use) Spark will always write -the files (with the exported data) inside a folder. - -Spark needs to use a folder to write the data. Because it generates some -extra files during the process that serves as ``placeholders'' or as -``statuses''. That is why Spark needs to create a folder, to store all -of these different files together during the process. - -In the example below, I decided to write this data into a folder called -\texttt{transf\_export}. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.write.csv(}\StringTok{"transf\_export"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Now, after I executed the above command, if I take a look at my current -working directory, I will see the \texttt{transf\_export} folder that -was created by Spark. 
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pathlib }\ImportTok{import}\NormalTok{ Path} -\NormalTok{current\_directory }\OperatorTok{=}\NormalTok{ Path(}\StringTok{"."}\NormalTok{)} -\NormalTok{folders\_in\_current\_directory }\OperatorTok{=}\NormalTok{ [} - \BuiltInTok{str}\NormalTok{(item)} - \ControlFlowTok{for}\NormalTok{ item }\KeywordTok{in}\NormalTok{ current\_directory.iterdir()} - \ControlFlowTok{if}\NormalTok{ item.is\_dir()} -\NormalTok{]} - -\BuiltInTok{print}\NormalTok{(folders\_in\_current\_directory)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -['metastore_db', 'transf_export'] -\end{verbatim} - -And if I look inside this \texttt{transf\_export} folder I will see two -files. One is the placeholder file (\texttt{\_SUCCESS}), and the other, -is a CSV file containing the exported data (\texttt{part-*.csv}). - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{export\_folder }\OperatorTok{=}\NormalTok{ Path(}\StringTok{"transf\_export"}\NormalTok{)} -\NormalTok{files }\OperatorTok{=}\NormalTok{ [}\BuiltInTok{str}\NormalTok{(x.name) }\ControlFlowTok{for}\NormalTok{ x }\KeywordTok{in}\NormalTok{ export\_folder.iterdir()]} -\BuiltInTok{print}\NormalTok{(files)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -['part-00000-a4ee2ff4-4b7f-499e-a904-cec8d524ac56-c000.csv', - '_SUCCESS'] -\end{verbatim} - -We can see this file structure by using the -\href{https://www.geeksforgeeks.org/tree-command-unixlinux/}{\texttt{tree} -command line utility}\footnote{\url{https://www.geeksforgeeks.org/tree-command-unixlinux/}} -to build a diagram of this file structure: - -\begin{verbatim} -Terminal$ tree transf_export -\end{verbatim} - -\begin{verbatim} -transf_export -├── part-00000-a4ee2ff4-4b7f-499e-a904-cec8d524ac56-c000.csv -└── _SUCCESS -\end{verbatim} - -\subsection{Setting the write mode}\label{setting-the-write-mode} - -You can set the mode of a write process by using the \texttt{mode()} -method. This ``mode of the write process'' affects specially the -behavior of the process when files for this particular DataFrame you -trying to export already exists in your file system. - -There are four write modes available in Spark: - -\begin{itemize} -\tightlist -\item - \texttt{append}: will append the exported data to existing files of - this specific DataFrame. -\item - \texttt{overwrite}: will overwrite the data inside existing files of - this specific DataFrame with the data that is being currently - exported. -\item - \texttt{error} or \texttt{errorifexists}: will throw an exception in - case already existing files for this specific DataFrame are found. -\item - \texttt{ignore}: silently ignore/abort this write operation in case - already existing files for this specific DataFrame are found. -\end{itemize} - -If we set the write mode to \texttt{overwrite}, this means that every -time we execute the command below, the files inside the folder -\texttt{transf\_export} are rewritten from scratch. Everytime we export -the data, the files \texttt{part-*} inside the folder are rewritten to -contain the most fresh data from \texttt{transf} DataFrame. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.write}\OperatorTok{\textbackslash{}} -\NormalTok{ .mode(}\StringTok{"overwrite"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"transf\_export"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -However, if we set the write mode to \texttt{error}, and run the command -again, then an error will be raised to indicate that the folder -(\texttt{transf\_export}) where we are trying to write the files already -exists. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.write}\OperatorTok{\textbackslash{}} -\NormalTok{ .mode(}\StringTok{"error"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"transf\_export"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -AnalysisException: [PATH_ALREADY_EXISTS] -Path file:/home/pedro/Documentos/Projetos/Livros/Introd-pyspark/C -hapters/transf_export -already exists. Set mode as "overwrite" to overwrite the existing - path. -\end{verbatim} - -In contrast, if we set the write mode to \texttt{append}, then the -current data of transf is appended (or ``added'') to the folder -\texttt{transf\_export}. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.write}\OperatorTok{\textbackslash{}} -\NormalTok{ .mode(}\StringTok{"append"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"transf\_export"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Now, if I take a look at the contents of the \texttt{transf\_export} -folder, I will see now two \texttt{part-*} files instead of just one. -Both files have the same size (around 218 kb) because they both contain -the same data, or the same lines from the \texttt{transf} DataFrame. - -\begin{verbatim} -Terminal$ tree transf_export -\end{verbatim} - -\begin{verbatim} -transf_export -├── part-00000-a4ee2ff4-4b7f-499e-a904-cec8d524ac56-c000.csv -├── part-00000-ffcc7487-fc60-403b-a815-a1dd56894062-c000.csv -└── _SUCCESS -\end{verbatim} - -This means that the data is currently duplicated inside the -\texttt{transf\_export} folder. We can see this duplication by looking -at the number of rows of the DataFrame contained inside -\texttt{transf\_export}. We can use \texttt{spark.read.load()} to -quickly load the contents of the \texttt{transf\_export} folder into a -new DataFrame, and use \texttt{count()} method to see the number of -rows. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.read.load(} - \StringTok{"transf\_export"}\NormalTok{,} - \BuiltInTok{format} \OperatorTok{=} \StringTok{"csv"}\NormalTok{,} -\NormalTok{ header }\OperatorTok{=} \VariableTok{False} -\NormalTok{)} -\NormalTok{df.count()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -4842 -\end{verbatim} - -The result above show us that the folder \texttt{transf\_export} -currently contains 4842 rows of data. This is the exact double of number -of rows in the \texttt{transf} DataFrame, which have 2421 rows. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.count()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -2421 -\end{verbatim} - -So, in resume, the difference between write mode \texttt{overwrite} and -\texttt{append}, is that \texttt{overwrite} causes Spark to erase the -contents of \texttt{transf\_export}, before it starts to write the -current data into the folder. This way, Spark exports the most recent -version of the data stored inside the DataFrame. 
In contrast, -\texttt{append} simply appends (or adds) new files to the folder -\texttt{transf\_export} with the most recent version of the data stored -inside the DataFrame. - -At Section~\ref{sec-sql-save-modes} (or more specifically, at -Figure~\ref{fig-save-table-modes}) we presented this difference -visually. So, in case you don't understood fully the difference between -these two write modes, you can comeback at -Section~\ref{sec-sql-save-modes} and check -Figure~\ref{fig-save-table-modes} to see if it clears your -understanding. OBS: save modes = write modes. - -\subsection{Setting write options}\label{setting-write-options} - -Each person might have different needs, and also, each file format (or -each write engine) have its particularities or advantages that you may -need to exploit. As a consequence, you might need to set some options to -customize the writing process to fit into your needs. - -You can set options for the write process using the \texttt{option()} -method of the write object. This method works with key value pairs. -Inside this method, you provide the a key that identifies the option you -want to set, and the value you want to give to this option. - -For CSV files, an option that is very popular is the \texttt{sep} -option, that corresponds to the separator character of the CSV. This is -a special character inside the CSV file that separates each column -field. - -As an example, if we wanted to build a CSV file which uses the semicolon -(\texttt{;} - which is the european standard for CSV files) as the -separator character, instead of the comma (\texttt{,} - which is the -american standard for CSV files), we just need to set the \texttt{sep} -option to \texttt{;}, like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .write}\OperatorTok{\textbackslash{}} -\NormalTok{ .mode(}\StringTok{"overwrite"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"sep"}\NormalTok{, }\StringTok{";"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"transf\_export"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Each file format (or each write engine) have different options that are -specific (or characteristic) to the file format itself. For example, -JSON and CSV files are text file formats, and because of that, one key -aspect to them is the encoding of the text that is being stored inside -these files. So both write engines for these file formats -(\texttt{csv()} and \texttt{json()}) have an option called -\texttt{encoding} that you can use to change the encoding being used to -write the data into these files. - -In the example below, we are asking Spark to write a CSV file using the -Latin1 encoding (ISO-8859-1). - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .write}\OperatorTok{\textbackslash{}} -\NormalTok{ .mode(}\StringTok{"overwrite"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"encoding"}\NormalTok{, }\StringTok{"ISO{-}8859{-}1"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"transf\_export"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -Is worth mentioning that the \texttt{option()} method sets one option at -a time. So if you need to set various write options, you just stack -\texttt{option()} calls on top of each other. In each call, you set a -different option. 
Like in the example below where we are setting options -\texttt{sep}, \texttt{encoding} and \texttt{header}: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .write}\OperatorTok{\textbackslash{}} -\NormalTok{ .mode(}\StringTok{"overwrite"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"sep"}\NormalTok{, }\StringTok{";"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"encoding"}\NormalTok{, }\StringTok{"UTF{-}8"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .option(}\StringTok{"header"}\NormalTok{, }\VariableTok{True}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"transf\_export"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -If you want to see the full list of options for each write engine, the -documentation of Spark have a table with the complete list of options -available at each write engine\footnote{\url{https://spark.apache.org/docs/latest/sql-data-sources-csv.html\#data-source-option}.}. - -\section{Number of partitions determines the number of files -generated}\label{sec-export-partition-coalesce} - -As I explained at Section~\ref{sec-dataframe-partitions}, every -DataFrame that exists in Spark is a \textbf{distributed} DataFrame, -meaning that this DataFrame is divided into multiple pieces (that we -call \emph{partitions}), and these pieces are spread across the nodes in -the Spark cluster. - -In other words, each machine that is present in the Spark cluster, -contains some partitions (or some pieces) of the total DataFrame. But -why we are discussing partitions here? Is because the number of -partitions of your DataFrame determines the number of files written by -Spark when you export the data using the \texttt{write} method. - -On the previous examples across Section~\ref{sec-write-example}, when we -exported the \texttt{transf} DataFrame into CSV files, only one single -CSV file was generated inside the \texttt{transf\_exported} folder. That -is because the \texttt{transf} DataFrame have only one single partition, -as the code below demonstrates: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.rdd.getNumPartitions()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -1 -\end{verbatim} - -That means that all the data from \texttt{transf} DataFrame is -concentrated into a single partition. Having that in mind, we could say -that Spark decided in this specific case to not actually distribute the -data of \texttt{transf}. Because all of its data is concentrated into -one single place. - -But what would happen if the \texttt{transf} DataFrame was splitted -across 5 different partitions? What would happen then? In that case, if -the \texttt{transf} DataFrame had 5 different partitions, and I ran the -command \texttt{transf.write.csv("transf\_export")} to export its data -into CSV files, then, 5 different CSV files would be written by Spark -inside the folder \texttt{transf\_export}. One CSV file for each -existing partition of the DataFrame. - -The same goes for any other file format, or any write engine that you -might use in Spark. Each file generated by the write process contains -the data from a specific partition of the DataFrame. - -\subsection{Avoid exporting too much data into a single -file}\label{avoid-exporting-too-much-data-into-a-single-file} - -Spark will always try to organize your DataFrame into a \emph{partition -distribution} that yields the best performance in any data processing. 
Usually, in production environments, we have huge amounts of data, and a
single-partition distribution is rarely the one that yields the best
performance in these environments.

That is why most Spark DataFrames in production environments are split
into multiple partitions across the Spark cluster. This means that Spark
DataFrames that are by default concentrated into one single partition
(like the \texttt{transf} DataFrame in the examples of this book) are
very, very rare to find in production environments.

As a consequence, if you really need to export your data into a single
static file in a production environment, you will likely need to either:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  repartition your Spark DataFrame, that is, reorganize the partitions
  of this DataFrame so that all of its data gets concentrated into a
  single partition;
\item
  or continue with the write process anyway and, after the write process
  is finished, merge all of the generated files together with some other
  tool, like \texttt{pandas}, \texttt{polars}, or the \texttt{tidyverse}.
\end{enumerate}

Option 2 above is a little out of the scope of this book, so I will not
explain it further here. But if you really need to export all the data
from your Spark DataFrame into a single static file (whatever file
format you choose), and you choose to follow option 1, then you need to
perform a repartition operation to concentrate all data from your Spark
DataFrame into a single partition.

It is worth mentioning that \textbf{I strongly advise against option 1},
because it may cause some serious bottlenecks in your data pipeline,
depending especially on the size of the DataFrame you are trying to
export.

In more detail: when you do not perform any repartition operation, that
is, when you just write your DataFrame as is, without touching the
existing partitions, the write process is a narrow transformation, as I
explained at Section~\ref{sec-narrow-wide}, because each partition is
exported into a single, separate file that is independent from the
others.

This is really important, because narrow transformations are much more
predictable and more easily scaled than wide transformations. As a
result, Spark tends to scale and perform better when dealing with narrow
transformations.

However, when you do perform a repartition operation to concentrate all
the data into a single partition, three things happen:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  the write process becomes a wide transformation, because all
  partitions need to be merged together and, as a consequence, all nodes
  in the cluster need to send their data to a single place (which is
  usually the driver node of the cluster);
\item
  a high amount of partition shuffling can happen inside the cluster,
  and, depending on the amount of data that needs to be ``shuffled''
  across the cluster, this may cause a serious slowdown in the
  processing;
\item
  depending on the size of all partitions merged together, the risk of
  an ``out of memory'' error being raised during the process grows
  rapidly.
\end{enumerate}

So you should be aware of these risks, and always try to avoid option 1.
Actually, you should avoid as much as possible the need to write all the
data into a single static file!
Is best for you to -just write the data using the default number of partitions that Spark -choose for your DataFrame. - -But anyway, if you really cannot avoid this need, and if you have, for -example, a \texttt{sales} DataFrame you want to export, and this -DataFrame contains 4 partitions: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{sales.rdd.getNumPartitions()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -4 -\end{verbatim} - -And you want to perform a repartition operation over this DataFrame to -export its data into a single static file, you can do so by using the -\texttt{coalesce()} DataFrame method. Just provide the number 1 to this -method, and all of the partitions will be reorganized into a single -partition: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{sales}\OperatorTok{\textbackslash{}} -\NormalTok{ .coalesce(}\DecValTok{1}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .rdd}\OperatorTok{\textbackslash{}} -\NormalTok{ .getNumPartitions()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -1 -\end{verbatim} - -Having that in mind, the entire source code to export the DataFrame into -a single static file would be something like this: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{sales}\OperatorTok{\textbackslash{}} -\NormalTok{ .coalesce(}\DecValTok{1}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .write}\OperatorTok{\textbackslash{}} -\NormalTok{ .mode(}\StringTok{"overwrite"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(}\StringTok{"sales\_export"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\section{Transforming to a Pandas DataFrame as a way to export -data}\label{transforming-to-a-pandas-dataframe-as-a-way-to-export-data} - -In case you don't know about this, Spark offers an API that you can use -to quickly convert your Spark DataFrames into a \texttt{pandas} -DataFrame. This might be extremely useful for a number of reasons: - -\begin{itemize} -\tightlist -\item - your colleague might be much more familiar with \texttt{pandas}, and - work more productively with it than \texttt{pyspark}. -\item - you might need to feed this data into an existing data pipeline that - uses \texttt{pandas} extensively. -\item - with \texttt{pandas} you can easily export this data into Excel files - (\texttt{.xlsx})\footnote{Actually, there is a Spark plugin available - that is capable of exporting data from Spark directly into Excel - files. But you need to install this plugin separately, since it does - not come with Spark from the factory: - \url{https://github.com/crealytics/spark-excel}.}, which are not - easily available in Spark. -\end{itemize} - -To convert an existing Spark DataFrame into a \texttt{pandas} DataFrame, -all you need to do is to call the \texttt{toPandas()} method of your -Spark DataFrame, and you will get a \texttt{pandas} DataFrame as output, -like in the example below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{as\_pandas\_df }\OperatorTok{=}\NormalTok{ transf.toPandas()} -\BuiltInTok{type}\NormalTok{(as\_pandas\_df)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -pandas.core.frame.DataFrame -\end{verbatim} - -But you should be careful with this method, because when you transform -your Spark DataFrame into a \texttt{pandas} DataFrame you eliminate the -distributed aspect of it. As a result, all the data from your DataFrame -needs to be loaded into a single place (which is usually the driver's -memory). 
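If you only need a small sample of the data on the \texttt{pandas} side
(to inspect it, or to build a quick chart, for example), one defensive
pattern is to cap the number of rows before converting. The sketch below
does that with a hypothetical limit of 10000 rows; the right limit
depends on how much memory your driver actually has:

\begin{verbatim}
# A sketch: bring at most 10000 rows to the driver,
# instead of converting the entire DataFrame
sample_pd = transf\
    .limit(10000)\
    .toPandas()

print(len(sample_pd))
\end{verbatim}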
Because of this concentration of data into a single place, calling
\texttt{toPandas()} on a large DataFrame might cause very similar issues
to the ones discussed at Section~\ref{sec-export-partition-coalesce}. In
other words, you might face the same slowdowns caused by performing a
repartition to concentrate all the data into a single partition.

So, as the Spark documentation itself suggests, you should use this
\texttt{toPandas()} method only if you know that your DataFrame is small
enough to fit into the driver's memory.

\section{\texorpdfstring{The \texttt{collect()} method as a way to
export
data}{The collect() method as a way to export data}}\label{the-collect-method-as-a-way-to-export-data}

The \texttt{collect()} DataFrame method exports the DataFrame's data
from Spark into a Python native object, more specifically, into a normal
Python list. To some extent, this is also a viable way to export data
from Spark.

By making this data from Spark available as a normal/standard Python
object, many new possibilities open up for us, such as:

\begin{itemize}
\tightlist
\item
  sending this data to another location via HTTP requests using the
  \texttt{requests} Python package.
\item
  sending this data by email using the \texttt{email} built-in Python
  package.
\item
  sending this data over the SFTP protocol with the \texttt{paramiko}
  Python package.
\item
  sending this data to a cloud storage service, such as Amazon S3 (using
  the \texttt{boto3} Python package).
\end{itemize}

By having the DataFrame's data easily available to Python as a Python
list, we can do virtually anything with it. We can use this data in
basically anything that Python is capable of doing.

Just as a simple example, if I needed to send the \texttt{transf} data
to a fictitious endpoint using a \texttt{POST} HTTP request, the source
code would probably be something similar to this:

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{import}\NormalTok{ requests}

\NormalTok{dataframe\_rows }\OperatorTok{=}\NormalTok{ transf.collect()}

\NormalTok{url }\OperatorTok{=} \StringTok{\textquotesingle{}https://example.com/api/v1/transf\textquotesingle{}}
\ControlFlowTok{for}\NormalTok{ row }\KeywordTok{in}\NormalTok{ dataframe\_rows:}
\NormalTok{    row\_as\_dict }\OperatorTok{=}\NormalTok{ row.asDict()}
\NormalTok{    requests.post(url, data }\OperatorTok{=}\NormalTok{ row\_as\_dict)}
\end{Highlighting}
\end{Shaded}

\bookmarksetup{startatroot}

\chapter{Tools for string manipulation}\label{sec-string-tools}

Much of the world's data is represented (or stored) as text (or string
variables). As a consequence, it is very important to know the tools
available to process and transform this kind of data, on any platform
you use. In this chapter, we will focus on these tools.

Most of the functionality available in \texttt{pyspark} to process text
data comes from functions available in the
\texttt{pyspark.sql.functions} module. This means that processing and
transforming text data in Spark usually involves applying a function to
a column of a Spark DataFrame (by using DataFrame methods such as
\texttt{withColumn()} and \texttt{select()}).

\section{\texorpdfstring{The \texttt{logs}
DataFrame}{The logs DataFrame}}\label{the-logs-dataframe}

Over the next examples in this chapter, we will use the \texttt{logs}
DataFrame, which contains various log messages registered at a
fictitious IP address.
The data that represents this DataFrame is freely -available trough the \texttt{logs.json} file, which you can download -from the official repository of this book\footnote{\url{https://github.com/pedropark99/Introd-pyspark/tree/main/Data}}. - -Each line of this JSON file contains a message that was recorded by the -logger of a fictitious system. Each log message have three main parts, -which are: 1) the type of message (warning - \texttt{WARN}, information -- \texttt{INFO}, error - \texttt{ERROR}); 2) timestamp of the event; 3) -the content of the message. In the example below, we have an example of -message: - -\begin{quote} -{[}INFO{]}: 2022-09-05 03:35:01.43 Looking for workers at South America -region; -\end{quote} - -To import \texttt{logs.json} file into a Spark DataFrame, I can use the -following code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{path }\OperatorTok{=} \StringTok{\textquotesingle{}./../Data/logs.json\textquotesingle{}} -\NormalTok{logs }\OperatorTok{=}\NormalTok{ spark.read.json(path)} -\NormalTok{n\_truncate }\OperatorTok{=} \DecValTok{50} -\NormalTok{logs.show(}\DecValTok{5}\NormalTok{, truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+ -| ip| -+--------------+ -| 1.0.104.27 | -| 1.0.104.27 | -| 1.0.104.27 | -| 1.0.104.27 | -| 1.0.104.27 | -+--------------+ -only showing top 5 rows -... with 1 more - columns: messa -ge -\end{verbatim} - -By default, when we use the \texttt{show()} action to see the contents -of our Spark DataFrame, Spark will always truncate (or cut) any value in -the DataFrame that is more than 20 characters long. Since the logs -messages in the \texttt{logs.json} file are usually much longer than 20 -characters, I am using the \texttt{truncate} argument of \texttt{show()} -in the example above, to avoid this behaviour. - -By setting this argument to 50, I am asking Spark to truncate (or cut) -values at the 50th character (instead of the 20th). By doing this, you -(reader) can actually see a much more significant part of the logs -messages in the result above. - -\section{Changing the case of letters in a -string}\label{changing-the-case-of-letters-in-a-string} - -Probably the most basic string transformation that exists is to change -the case of the letters (or characters) that compose the string. That -is, to raise specific letters to upper-case, or reduce them to -lower-case, and vice-versa. - -As a first example, lets go back to the \texttt{logs} DataFrame, and try -to change all messages in this DataFrame to lower case, upper case and -title case, by using the \texttt{lower()}, \texttt{upper()}, and -\texttt{initcap()} functions from the \texttt{pyspark.sql.functions} -module. 
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ (} -\NormalTok{ lower,} -\NormalTok{ upper,} -\NormalTok{ initcap} -\NormalTok{)} - -\NormalTok{m }\OperatorTok{=}\NormalTok{ logs.select(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{)} -\CommentTok{\# Change to lower case:} -\NormalTok{m.withColumn(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, lower(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{, truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------------------------------------------+ -| message| -+--------------------------------------------------+ -|[info]: 2022-09-05 03:35:01.43 looking for work...| -|[warn]: 2022-09-05 03:35:58.007 workers are una...| -|[info]: 2022-09-05 03:40:59.054 looking for wor...| -|[info]: 2022-09-05 03:42:24 3 workers were acqu...| -|[info]: 2022-09-05 03:42:37 initializing instan...| -+--------------------------------------------------+ -only showing top 5 rows -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Change to upper case:} -\NormalTok{m.withColumn(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, upper(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{, truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------------------------------------------+ -| message| -+--------------------------------------------------+ -|[INFO]: 2022-09-05 03:35:01.43 LOOKING FOR WORK...| -|[WARN]: 2022-09-05 03:35:58.007 WORKERS ARE UNA...| -|[INFO]: 2022-09-05 03:40:59.054 LOOKING FOR WOR...| -|[INFO]: 2022-09-05 03:42:24 3 WORKERS WERE ACQU...| -|[INFO]: 2022-09-05 03:42:37 INITIALIZING INSTAN...| -+--------------------------------------------------+ -only showing top 5 rows -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\CommentTok{\# Change to title case} -\CommentTok{\# (first letter of each word is upper case):} -\NormalTok{m.withColumn(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, initcap(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{, truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------------------------------------------+ -| message| -+--------------------------------------------------+ -|[info]: 2022-09-05 03:35:01.43 Looking For Work...| -|[warn]: 2022-09-05 03:35:58.007 Workers Are Una...| -|[info]: 2022-09-05 03:40:59.054 Looking For Wor...| -|[info]: 2022-09-05 03:42:24 3 Workers Were Acqu...| -|[info]: 2022-09-05 03:42:37 Initializing Instan...| -+--------------------------------------------------+ -only showing top 5 rows -\end{verbatim} - -\section{Calculating string length}\label{calculating-string-length} - -In Spark, you can use the \texttt{length()} function to get the length -(i.e.~the number of characters) of a string. In the example below, we -can see that the first log message is 74 characters long, while the -second log message have 112 characters. 
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ length} -\NormalTok{logs}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}length\textquotesingle{}}\NormalTok{, length(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+--------------------+------+ -| ip| message|length| -+--------------+--------------------+------+ -| 1.0.104.27 |[INFO]: 2022-09-0...| 74| -| 1.0.104.27 |[WARN]: 2022-09-0...| 112| -| 1.0.104.27 |[INFO]: 2022-09-0...| 75| -| 1.0.104.27 |[INFO]: 2022-09-0...| 94| -| 1.0.104.27 |[INFO]: 2022-09-0...| 65| -+--------------+--------------------+------+ -only showing top 5 rows -\end{verbatim} - -\section{Trimming or removing spaces from -strings}\label{trimming-or-removing-spaces-from-strings} - -The process of removing unnecessary spaces from strings is usually -called ``trimming''. In Spark, we have three functions that do this -process, which are: - -\begin{itemize} -\tightlist -\item - \texttt{trim()}: removes spaces from both sides of the string; -\item - \texttt{ltrim()}: removes spaces from the left side of the string; -\item - \texttt{rtrim()}: removes spaces from the right side of the string; -\end{itemize} - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ (} -\NormalTok{ trim, rtrim, ltrim} -\NormalTok{)} - -\NormalTok{logs}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}ip\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}ip\_trim\textquotesingle{}}\NormalTok{, trim(}\StringTok{\textquotesingle{}ip\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}ip\_ltrim\textquotesingle{}}\NormalTok{, ltrim(}\StringTok{\textquotesingle{}ip\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}ip\_rtrim\textquotesingle{}}\NormalTok{, rtrim(}\StringTok{\textquotesingle{}ip\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+----------+------------+------------+ -| ip| ip_trim| ip_ltrim| ip_rtrim| -+--------------+----------+------------+------------+ -| 1.0.104.27 |1.0.104.27|1.0.104.27 | 1.0.104.27| -| 1.0.104.27 |1.0.104.27|1.0.104.27 | 1.0.104.27| -| 1.0.104.27 |1.0.104.27|1.0.104.27 | 1.0.104.27| -| 1.0.104.27 |1.0.104.27|1.0.104.27 | 1.0.104.27| -| 1.0.104.27 |1.0.104.27|1.0.104.27 | 1.0.104.27| -+--------------+----------+------------+------------+ -only showing top 5 rows -\end{verbatim} - -For the most part, I tend to remove these unnecessary strings when I -want to: 1) tidy the values; 2) avoid weird and confusing mistakes in -filters on my DataFrame. The second case is worth describing in more -details. - -Let's suppose you wanted to filter all rows from the \texttt{logs} -DataFrame where \texttt{ip} is equal to the \texttt{1.0.104.27} IP -adress. However, you can see in the result above, that I get nothing. -Not a single row of result. 
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} -\NormalTok{logs.}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}ip\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \StringTok{"1.0.104.27"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+---+-------+ -| ip|message| -+---+-------+ -+---+-------+ -\end{verbatim} - -But if you see the result of the previous example (where we appliead the -three versions of ``trim functions''), you know that this IP adress -\texttt{1.0.104.27} exists in the DataFrame. You know that the filter -above should find values for this IP adress. So why it did not find any -rows? - -The answer is these annoying (and hidden) spaces on both sides of the -values from the \texttt{ip} column. If we remove these unnecessary -spaces from the values of the \texttt{ip} column, we suddenly find the -rows that we were looking for. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{logs.}\BuiltInTok{filter}\NormalTok{(trim(col(}\StringTok{\textquotesingle{}ip\textquotesingle{}}\NormalTok{)) }\OperatorTok{==} \StringTok{"1.0.104.27"}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+--------------------+ -| ip| message| -+--------------+--------------------+ -| 1.0.104.27 |[INFO]: 2022-09-0...| -| 1.0.104.27 |[WARN]: 2022-09-0...| -| 1.0.104.27 |[INFO]: 2022-09-0...| -| 1.0.104.27 |[INFO]: 2022-09-0...| -| 1.0.104.27 |[INFO]: 2022-09-0...| -+--------------+--------------------+ -only showing top 5 rows -\end{verbatim} - -\section{Extracting substrings}\label{extracting-substrings} - -There are five main functions that we can use in order to extract -substrings of a string, which are: - -\begin{itemize} -\tightlist -\item - \texttt{substring()} and \texttt{substr()}: extract a single substring - based on a start position and the length (number of characters) of the - collected substring\footnote{Instead of using a zero based index - (which is the default for Python), these functions use a one based - index.}; -\item - \texttt{substring\_index()}: extract a single substring based on a - delimiter character\footnote{Instead of using a zero based index - (which is the default for Python), these functions use a one based - index.}; -\item - \texttt{split()}: extract one or multiple substrings based on a - delimiter character; -\item - \texttt{regexp\_extract()}: extracts substrings from a given string - that match a specified regular expression pattern; -\end{itemize} - -You can obviously extract a substring that matches a particular regex -(regular expression) as well, by using the \texttt{regexp\_extract()} -function. However, I will describe this function, and the regex -functionality available in \texttt{pyspark} at Section~\ref{sec-regex}, -or, more specifically, at Section~\ref{sec-regexp-extract}. For now, -just understand that you can also use regex to extract substrings from -your text data. - -\subsection{A substring based on a start position and -length}\label{a-substring-based-on-a-start-position-and-length} - -The \texttt{substring()} and \texttt{substr()} functions they both work -the same way. However, they come from different places. 
The -\texttt{substring()} function comes from the -\texttt{spark.sql.functions} module, while the \texttt{substr()} -function is actually a method from the \texttt{Column} class. - -One interesting aspect of these functions, is that they both use a -one-based index, instead of a zero-based index. This means that the -first character in the full string is identified by the index 1, instead -of the index 0. - -The first argument in both function is the index that identifies the -start position of the substring. If you set this argument to, let's say, -4, it means that the substring you want to extract starts at the 4th -character in the input string. - -The second argument is the amount of characters in the substring, or, in -other words, it's length. For example, if you set this argument to 10, -it means that the function will extract the substring that is formed by -walking \(10 - 1 = 9\) characters ahead from the start position you -specified at the first argument. We can also interpret this as: the -function will walk ahead on the string, from the start position, until -it gets a substring that is 10 characters long. - -In the example below, we are extracting the substring that starts at the -second character (index 2) and ends at the sixth character (index 6) in -the string. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col, substring} -\CommentTok{\# \textasciigrave{}df1\textasciigrave{} and \textasciigrave{}df2\textasciigrave{} are equal, because} -\CommentTok{\# they both mean the same thing} -\NormalTok{df1 }\OperatorTok{=}\NormalTok{ (logs} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}sub\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{).substr(}\DecValTok{2}\NormalTok{, }\DecValTok{5}\NormalTok{))} -\NormalTok{)} - -\NormalTok{df2 }\OperatorTok{=}\NormalTok{ (logs} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}sub\textquotesingle{}}\NormalTok{, substring(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, }\DecValTok{2}\NormalTok{, }\DecValTok{5}\NormalTok{))} -\NormalTok{)} - -\NormalTok{df2.show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------+--------------------+-----+ -| ip| message| sub| -+--------------+--------------------+-----+ -| 1.0.104.27 |[INFO]: 2022-09-0...|INFO]| -| 1.0.104.27 |[WARN]: 2022-09-0...|WARN]| -| 1.0.104.27 |[INFO]: 2022-09-0...|INFO]| -| 1.0.104.27 |[INFO]: 2022-09-0...|INFO]| -| 1.0.104.27 |[INFO]: 2022-09-0...|INFO]| -+--------------+--------------------+-----+ -only showing top 5 rows -\end{verbatim} - -Just to be very clear on how \texttt{substring()} and \texttt{substr()} -both works. The Figure~\ref{fig-substring-start-length} illustrates the -result of the above code. - -\begin{figure} - -\centering{ - -\includegraphics{Chapters/./../Figures/substring-start-length.png} - -} - -\caption{\label{fig-substring-start-length}How \texttt{substring()} and -\texttt{substr()} works} - -\end{figure}% - -\subsection{A substring based on a -delimiter}\label{a-substring-based-on-a-delimiter} - -The \texttt{substring\_index()} function works very differently. It -collects the substring formed between the start of the string, and the -nth occurrence of a particular character. 
- -For example, if you ask \texttt{substring\_index()} to search for the -3rd occurrence of the character \texttt{\$} in your string, the function -will return to you the substring formed by all characters that are -between the start of the string until the 3rd occurrence of this -character \texttt{\$}. - -You can also ask \texttt{substring\_index()} to read backwards. That is, -to start the search on the end of the string, and move backwards in the -string until it gets to the 3rd occurrence of this character -\texttt{\$}. - -As an example, let's look at the 10th log message present in the -\texttt{logs} DataFrame. I used the \texttt{collect()} DataFrame method -to collect this message into a raw python string, so we can easily see -the full content of the message. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ monotonically\_increasing\_id} - -\NormalTok{mes\_10th }\OperatorTok{=}\NormalTok{ (} -\NormalTok{ logs} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}row\_id\textquotesingle{}}\NormalTok{,} -\NormalTok{ monotonically\_increasing\_id()} -\NormalTok{ )} -\NormalTok{ .where(col(}\StringTok{\textquotesingle{}row\_id\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \DecValTok{9}\NormalTok{)} -\NormalTok{)} - -\NormalTok{message }\OperatorTok{=}\NormalTok{ mes\_10th.collect()[}\DecValTok{0}\NormalTok{][}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{]} -\BuiltInTok{print}\NormalTok{(message)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -[INFO]: 2022-09-05 04:02:09.05 Libraries installed: pandas, - flask, numpy, spark_map, pyspark -\end{verbatim} - -We can see that this log message is listing a set of libraries that were -installed somewhere. Suppose you want to collect the first and the last -libraries in this list. How would you do it? - -A good start is to isolate the list of libraries from the rest of the -message. In other words, there is a bunch of characters in the start of -the log message, that we do not care about. So let's get rid of them. - -If you look closely to the message, you can see that the character -\texttt{:} appears twice whithin the message. One close to the start of -the string, and another time right before the start of the list of the -libraries. We can use this character as our first delimiter, to collect -the third substring that it creates within the total string, which is -the substring that contains the list of libraries. - -This first stage is presented visually at -Figure~\ref{fig-substring-delimiter1}. - -\begin{figure} - -\centering{ - -\includegraphics{Chapters/./../Figures/substring-delimiter1.png} - -} - -\caption{\label{fig-substring-delimiter1}Substrings produced by the -\texttt{:} delimiter character} - -\end{figure}% - -Now that we identified the substrings produced by the ``delimiter -character'', we just need to understand better which index we need to -use in \texttt{substring\_index()} to get this third substring that we -want. The Figure~\ref{fig-substring-delimiter2} presents in a visual -manner how the count system of \texttt{substring\_index()} works. - -When you use a positive index, \texttt{substring\_index()} will count -the occurrences of the delimiter character from left to right. But, when -you use a negative index, the opposite happens. That is, -\texttt{substring\_index()} counts the occurrences of the delimiter -character from right to left. 
- -\begin{figure} - -\centering{ - -\includegraphics{Chapters/./../Figures/substring-delimiter2.png} - -} - -\caption{\label{fig-substring-delimiter2}The count system of -\texttt{substring\_index()}} - -\end{figure}% - -The index 1 represents the first substring that is before the the 1st -occurence of the delimiter (\texttt{{[}INFO{]}}). The index 2 represents -everything that is before the 2nd occurence of the delimiter -(\texttt{{[}INFO{]}:\ 2022-09-05\ 04:02.09.05\ Libraries\ installed}). -etc. - -In contrast, the index -1 represents everything that is after the 1st -occurence of the delimiter, couting from right to left -(\texttt{pandas,\ flask,\ numpy,\ spark\_map,\ pyspark}). The index -2 -represents everything that is after the 2nd occurence of the delimiter -(\texttt{2022-09-05\ 04:02.09.05\ Libraries\ installed:\ pandas,\ flask,\ numpy,\ spark\_map,\ pyspark}). -Again, couting from right to left. - -Having all these informations in mind, we can conclude that the -following code fit our first objective. Note that I applied the -\texttt{trim()} function over the result of \texttt{substring\_index()}, -to ensure that the result substring does not contain any unnecessary -spaces at both ends. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ substring\_index} -\NormalTok{mes\_10th }\OperatorTok{=}\NormalTok{ mes\_10th}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}list\_of\_libraries\textquotesingle{}}\NormalTok{,} -\NormalTok{ trim(substring\_index(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}:\textquotesingle{}}\NormalTok{, }\OperatorTok{{-}}\DecValTok{1}\NormalTok{))} -\NormalTok{ )} - -\NormalTok{mes\_10th.select(}\StringTok{\textquotesingle{}list\_of\_libraries\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------------------------------------+ -| list_of_libraries| -+----------------------------------------+ -|pandas, flask, numpy, spark_map, pyspark| -+----------------------------------------+ -\end{verbatim} - -\subsection{Forming an array of -substrings}\label{forming-an-array-of-substrings} - -Now is a good time to introduce the \texttt{split()} function, because -we can use it to extract the first and the last library from the list -libraries of stored at the \texttt{mes\_10th} DataFrame. - -Basically, this function also uses a delimiter character to cut the -total string into multiple pieces. However, this function stores these -multiple pieces (or multiple substrings) into an array of substrings. -With this strategy, we can now access each substring (or each piece of -the total string) individually. - -If we look again at the string that we stored at the -\texttt{list\_of\_libraries} column, we have a list of libraries, -separated by a comma. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{mes\_10th}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}list\_of\_libraries\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------------------------------------+ -| list_of_libraries| -+----------------------------------------+ -|pandas, flask, numpy, spark_map, pyspark| -+----------------------------------------+ -\end{verbatim} - -The comma character (\texttt{,}) plays an important role in this string, -by separating each value in the list. And we can use this comma -character as the delimiter inside \texttt{split()}, to get an array of -substrings. Each element of this array is one of the many libraries in -the list. The Figure~\ref{fig-string-split} presents this process -visually. - -\begin{figure} - -\centering{ - -\includegraphics{Chapters/./../Figures/string-split.png} - -} - -\caption{\label{fig-string-split}Building an array of substrings with -\texttt{split()}} - -\end{figure}% - -The code to make this process is very straightforward. In the example -below, the column \texttt{array\_of\_libraries} becomes a column of data -type \texttt{ArrayType(StringType)}, that is, an array of string values. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ split} -\NormalTok{mes\_10th }\OperatorTok{=}\NormalTok{ mes\_10th}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}array\_of\_libraries\textquotesingle{}}\NormalTok{,} -\NormalTok{ split(}\StringTok{\textquotesingle{}list\_of\_libraries\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}, \textquotesingle{}}\NormalTok{)} -\NormalTok{ )} - -\NormalTok{mes\_10th}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}array\_of\_libraries\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------------------------------------+ -| array_of_libraries| -+------------------------------------------+ -|[pandas, flask, numpy, spark_map, pyspark]| -+------------------------------------------+ -\end{verbatim} - -By having this array of substring, we can very easily select a specific -element in this array, by using the \texttt{getItem()} column method, -or, by using the open brackets as you would normally use to select an -element in a python list. - -You just need to give the index of the element you want to select, like -in the example below that we select the first and the fifth libraries in -the array. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{mes\_10th}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}lib\_1\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}array\_of\_libraries\textquotesingle{}}\NormalTok{)[}\DecValTok{0}\NormalTok{])}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}lib\_5\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}array\_of\_libraries\textquotesingle{}}\NormalTok{).getItem(}\DecValTok{4}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}lib\_1\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}lib\_5\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------+-------+ -| lib_1| lib_5| -+------+-------+ -|pandas|pyspark| -+------+-------+ -\end{verbatim} - -\section{Concatenating multiple strings -together}\label{concatenating-multiple-strings-together} - -Sometimes, we need to concatenate multiple strings together, to form a -single and longer string. To do this process, Spark offers two main -functions, which are: \texttt{concat()} and \texttt{concat\_ws()}. Both -of these functions receives a list of columns as input, and will perform -the same task, which is to concatenate the values of each column in the -list, sequentially. - -However, the \texttt{concat\_ws()} function have an extra argument -called \texttt{sep}, where you can define a string to be used as the -separator (or the ``delimiter'') between the values of each column in -the list. In some way, this \texttt{sep} argument and the -\texttt{concat\_ws()} function works very similarly to the -\href{https://docs.python.org/3/library/stdtypes.html\#str.join}{\texttt{join()} -string method of python}\footnote{\url{https://docs.python.org/3/library/stdtypes.html\#str.join}}. - -Let's comeback to the \texttt{penguins} DataFrame to demonstrate the use -of these functions: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{path }\OperatorTok{=} \StringTok{"../Data/penguins.csv"} -\NormalTok{penguins }\OperatorTok{=}\NormalTok{ spark.read}\OperatorTok{\textbackslash{}} -\NormalTok{ .csv(path, header }\OperatorTok{=} \VariableTok{True}\NormalTok{)} - -\NormalTok{penguins.select(}\StringTok{\textquotesingle{}species\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}island\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------+---------+------+ -|species| island| sex| -+-------+---------+------+ -| Adelie|Torgersen| male| -| Adelie|Torgersen|female| -| Adelie|Torgersen|female| -| Adelie|Torgersen| NULL| -| Adelie|Torgersen|female| -+-------+---------+------+ -only showing top 5 rows -\end{verbatim} - -Suppose you wanted to concatenate the values of the columns -\texttt{species}, \texttt{island} and \texttt{sex} together, and, store -these new values on a separate column. All you need to do is to list -these columns inside the \texttt{concat()} or \texttt{concat\_ws()} -function. - -If you look at the example below, you can see that I also used the -\texttt{lit()} function to add a underline character (\texttt{\_}) -between the values of each column. 
This is more verbose, because if you -needed to concatenate 10 columns together, and still add a ``delimiter -character'' (like the underline) between the values of each column, you -would have to write \texttt{lit(\textquotesingle{}\_\textquotesingle{})} -for 9 times on the list. - -In contrast, the \texttt{concat\_ws()} offers a much more succinct way -of expressing this same operation. Because the first argument of -\texttt{concat\_ws()} is the character to be used as the delimiter -between each column, and, after that, we have the list of columns to be -concatenated. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ (} -\NormalTok{ concat,} -\NormalTok{ concat\_ws,} -\NormalTok{ lit} -\NormalTok{)} - -\NormalTok{penguins}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}using\_concat\textquotesingle{}}\NormalTok{,} -\NormalTok{ concat(} - \StringTok{\textquotesingle{}species\textquotesingle{}}\NormalTok{, lit(}\StringTok{\textquotesingle{}\_\textquotesingle{}}\NormalTok{), }\StringTok{\textquotesingle{}island\textquotesingle{}}\NormalTok{,} -\NormalTok{ lit(}\StringTok{\textquotesingle{}\_\textquotesingle{}}\NormalTok{), }\StringTok{\textquotesingle{}sex\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}using\_concat\_ws\textquotesingle{}}\NormalTok{,} -\NormalTok{ concat\_ws(} - \StringTok{\textquotesingle{}\_\textquotesingle{}}\NormalTok{, }\CommentTok{\# The delimiter character} - \StringTok{\textquotesingle{}species\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}island\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}sex\textquotesingle{}} \CommentTok{\# The list of columns} -\NormalTok{ )} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}using\_concat\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}using\_concat\_ws\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{, truncate }\OperatorTok{=}\NormalTok{ n\_truncate)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----------------------+-----------------------+ -| using_concat| using_concat_ws| -+-----------------------+-----------------------+ -| Adelie_Torgersen_male| Adelie_Torgersen_male| -|Adelie_Torgersen_female|Adelie_Torgersen_female| -|Adelie_Torgersen_female|Adelie_Torgersen_female| -| NULL| Adelie_Torgersen| -|Adelie_Torgersen_female|Adelie_Torgersen_female| -+-----------------------+-----------------------+ -only showing top 5 rows -\end{verbatim} - -If you look closely to the result above, you can also see, that -\texttt{concat()} and \texttt{concat\_ws()} functions deal with null -values in different ways. If \texttt{concat()} finds a null value for a -particular row, in any of the listed columns to be concatenated, the end -result of the process is a null value for that particular row. - -On the other hand, \texttt{concat\_ws()} will try to concatenate as many -values as he can. If he does find a null value, he just ignores this -null value and go on to the next column, until it hits the last column -in the list. - -\section{Introducing regular expressions}\label{sec-regex} - -Spark also provides some basic regex (\emph{regular expressions}) -functionality. 
Most of this functionality is available through two functions that come
from the \texttt{pyspark.sql.functions} module, which are:

\begin{itemize}
\tightlist
\item
  \texttt{regexp\_replace()}: replaces all occurrences of a specified
  regular expression pattern in a given string with a replacement
  string;
\item
  \texttt{regexp\_extract()}: extracts substrings from a given string
  that match a specified regular expression pattern;
\end{itemize}

There is also a column method that provides a useful way of testing if
the values of a column match a regular expression or not, which is the
\texttt{rlike()} column method. You can use the \texttt{rlike()} method
in conjunction with the \texttt{filter()} or \texttt{where()} DataFrame
methods, to find all values that fit (or match) a particular regular
expression, like we demonstrated at
Section~\ref{sec-filter-regex-pattern}.

\subsection{The Java regular expression
standard}\label{the-java-regular-expression-standard}

At this point, it is worth remembering a basic fact about Apache Spark
that we introduced at Chapter~\ref{sec-introd-spark}. Apache Spark is
written in Scala, which is a modern programming language deeply
connected with the Java programming language. One of the many
consequences of this fact is that all regular expression functionality
available in Apache Spark is based on the Java \texttt{java.util.regex}
package.

This means that you should always write regular expressions in your
\texttt{pyspark} code that follow the Java regular expression syntax,
and not the Python regular expression syntax, which is based on the
Python module \texttt{re}.

Although this detail is important, these two flavors of regular
expressions (Python syntax versus Java syntax) are very, very similar.
So, for the most part, you should not see any difference between these
two syntaxes.

If, for some reason, you need to consult the full list of all
metacharacters available in the Java regular expression standard, you
can always check the Java documentation for the \texttt{java.util.regex}
package. More specifically, the
\href{https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html}{documentation
for the \texttt{java.util.regex.Pattern} class}\footnote{\url{https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html}}.

The following list gives you a quick description of a small fraction of
the metacharacters available in the Java syntax, and, as a result,
metacharacters that you can use in \texttt{pyspark}:

\begin{itemize}
\tightlist
\item
  \texttt{.} : Matches any single character;
\item
  \texttt{*} : Matches zero or more occurrences of the preceding
  character or pattern;
\item
  \texttt{+} : Matches one or more occurrences of the preceding
  character or pattern;
\item
  \texttt{?} : Matches zero or one occurrence of the preceding character
  or pattern;
\item
  \texttt{\textbar{}} : Matches either the expression before or after
  the \texttt{\textbar{}};
\item
  \texttt{{[}{]}} : Matches any single character within the brackets;
\item
  \texttt{\textbackslash{}d} : Matches any digit character;
\item
  \texttt{\textbackslash{}b} : Matches a word boundary;
\item
  \texttt{\textbackslash{}w} : Matches any word character. Equivalent to
  the character class \texttt{{[}a-zA-Z\_0-9{]}};
\item
  \texttt{\textbackslash{}s} : Matches any whitespace character;
\item
  \texttt{()} : Groups a series of pattern elements into a single
  element;
\end{itemize}

\subsection{Using an invalid regular
expression}\label{using-an-invalid-regular-expression}

When you write an invalid regular expression in your code, Spark usually
complains with a \texttt{java.util.regex.PatternSyntaxException} runtime
error. The code presented below is an example of code that produces such
an error.

In this example, the regular expression
\texttt{\textbackslash{}b({[}a-z{]}} is invalid because it is missing a
closing parenthesis. If you try to execute this code, Spark will raise
an error with the message ``Unclosed group near index 7''. This error
message indicates that there is a syntax error in the regular
expression, due to an unclosed group (i.e., a missing closing
parenthesis).

\begin{Shaded}
\begin{Highlighting}[]
\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col}
\NormalTok{weird\_regex }\OperatorTok{=} \StringTok{\textquotesingle{}}\CharTok{\textbackslash{}b}\StringTok{([a{-}z]\textquotesingle{}}
\NormalTok{logs}\OperatorTok{\textbackslash{}}
\NormalTok{    .}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{).rlike(weird\_regex))}\OperatorTok{\textbackslash{}}
\NormalTok{    .show(}\DecValTok{5}\NormalTok{)}
\end{Highlighting}
\end{Shaded}

\begin{verbatim}
Py4JJavaError: An error occurred while calling o261.showString.
: java.util.regex.PatternSyntaxException: Unclosed group near i
ndex 7
([a-z]
\end{verbatim}

To avoid these runtime errors caused by invalid regular expressions, it
is always a good idea to test your regular expressions before you use
them in your \texttt{pyspark} code. You can easily test your regular
expressions by using online tools, such as the
\href{https://regex101.com/}{Regex101 website}\footnote{\url{https://regex101.com/}}.

\subsection{\texorpdfstring{Replacing occurrences of a particular
regular expression with
\texttt{regexp\_replace()}}{Replacing occurrences of a particular regular expression with regexp\_replace()}}\label{sec-regexp-replace}

One of the most essential actions with regular expressions is to find
text that fits a particular regular expression and rewrite this text
into a different format, or even remove it completely from the string.

The \texttt{regexp\_replace()} function (from the
\texttt{pyspark.sql.functions} module) is the function that allows you
to perform this kind of operation on the string values of a column in a
Spark DataFrame.

This function replaces all occurrences of a specified regular expression
pattern in a given string with a replacement string, and it takes three
different arguments:

\begin{itemize}
\tightlist
\item
  The input column name or expression that contains the string values to
  be modified;
\item
  The regular expression pattern to search for within the input string
  values;
\item
  The replacement string that will replace all occurrences of the
  matched pattern in the input string values;
\end{itemize}

As an example, let's suppose we want to completely remove the message
type label from all log messages present in the \texttt{logs} DataFrame.
To do that, we first need a regular expression capable of identifying
all possibilities for these types.
- -A potential candidate would be the regular expression -\texttt{\textquotesingle{}\textbackslash{}\textbackslash{}{[}(INFO\textbar{}ERROR\textbar{}WARN)\textbackslash{}\textbackslash{}{]}:\ \textquotesingle{}}, -so lets give it a shot. Since we are trying to \textbf{remove} this -particular part from all log messages, we should replace this part of -the string by an empty string -(\texttt{\textquotesingle{}\textquotesingle{}}), like in the example -below: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ regexp\_replace} - -\NormalTok{type\_regex }\OperatorTok{=} \StringTok{\textquotesingle{}}\CharTok{\textbackslash{}\textbackslash{}}\StringTok{[(INFO|ERROR|WARN)}\CharTok{\textbackslash{}\textbackslash{}}\StringTok{]: \textquotesingle{}} - -\NormalTok{logs}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}without\_type\textquotesingle{}}\NormalTok{,} -\NormalTok{ regexp\_replace(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, type\_regex, }\StringTok{\textquotesingle{}\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}without\_type\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(truncate }\OperatorTok{=} \DecValTok{30}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------------------------+------------------------------+ -| message| without_type| -+------------------------------+------------------------------+ -|[INFO]: 2022-09-05 03:35:01...|2022-09-05 03:35:01.43 Look...| -|[WARN]: 2022-09-05 03:35:58...|2022-09-05 03:35:58.007 Wor...| -|[INFO]: 2022-09-05 03:40:59...|2022-09-05 03:40:59.054 Loo...| -|[INFO]: 2022-09-05 03:42:24...|2022-09-05 03:42:24 3 Worke...| -|[INFO]: 2022-09-05 03:42:37...|2022-09-05 03:42:37 Initial...| -|[WARN]: 2022-09-05 03:52:02...|2022-09-05 03:52:02.98 Libr...| -|[INFO]: 2022-09-05 04:00:33...|2022-09-05 04:00:33.210 Lib...| -|[INFO]: 2022-09-05 04:01:15...|2022-09-05 04:01:15 All clu...| -|[INFO]: 2022-09-05 04:01:35...|2022-09-05 04:01:35.022 Mak...| -|[INFO]: 2022-09-05 04:02:09...|2022-09-05 04:02:09.05 Libr...| -|[INFO]: 2022-09-05 04:02:09...|2022-09-05 04:02:09.05 The ...| -|[INFO]: 2022-09-05 04:02:09...|2022-09-05 04:02:09.05 An e...| -|[ERROR]: 2022-09-05 04:02:1...|2022-09-05 04:02:12 A task ...| -|[ERROR]: 2022-09-05 04:02:3...|2022-09-05 04:02:34.111 Err...| -|[ERROR]: 2022-09-05 04:02:3...|2022-09-05 04:02:34.678 Tra...| -|[ERROR]: 2022-09-05 04:02:3...|2022-09-05 04:02:35.14 Quit...| -+------------------------------+------------------------------+ -\end{verbatim} - -Is useful to remind that this \texttt{regexp\_replace()} function -searches for \textbf{all occurrences} of the regular expression on the -input string values, and replaces all of these occurrences by the input -replacement string that you gave. However, if the function does not find -any matchs for your regular expression inside a particular value in the -column, then, the function simply returns this value intact. 
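To make this last point concrete, here is a small sketch that applies
the same pattern to a tiny, hypothetical DataFrame built with
\texttt{spark.createDataFrame()}, reusing the \texttt{regexp\_replace}
import and the \texttt{type\_regex} pattern defined above. The second
row does not match the pattern, so it comes back intact:

\begin{verbatim}
# A tiny, hypothetical DataFrame, reusing the `type_regex`
# pattern defined above
tiny = spark.createDataFrame(
    [("[INFO]: all workers are up",),
     ("a message without a type label",)],
    ["message"]
)

tiny.withColumn(
    'no_type',
    regexp_replace('message', type_regex, '')
).show(truncate = False)
# First row  -> "all workers are up" (the label is removed)
# Second row -> "a message without a type label" (returned intact)
\end{verbatim}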
\subsection{\texorpdfstring{Introducing capturing groups on
\texttt{pyspark}}{Introducing capturing groups on pyspark}}\label{introducing-capturing-groups-on-pyspark}

One of the many awesome functionalities of regular expressions is the
capability of enclosing parts of a regular expression inside groups, and
actually storing (or caching) the substring matched by each group. This
process of grouping parts of a regular expression, and capturing
substrings with these groups, is usually called ``grouping and
capturing''.

It is worth pointing out that this capturing-group functionality is
available both in \texttt{regexp\_replace()} and
\texttt{regexp\_extract()}.

\subsubsection{What is a capturing group
?}\label{what-is-a-capturing-group}

Ok, but what is this group thing? You create a group inside a regular
expression by enclosing a particular section of your regular expression
inside a pair of parentheses. The regular expression that is written
inside this pair of parentheses represents a capturing group.

A capturing group inside a regular expression is used to capture a
specific part of the matched string. This means that the actual part of
the input string that is matched by the regular expression inside this
pair of parentheses is captured (or cached, or saved) by the group, and
can be reused later.

\begin{quote}
Besides grouping part of a regular expression together, parentheses also
create a numbered capturing group. It stores the part of the string
matched by the part of the regular expression inside the parentheses.
\ldots. The regex ``Set(Value)?'' matches ``Set'' or ``SetValue''. In
the first case, the first (and only) capturing group remains empty. In
the second case, the first capturing group matches ``Value''. (Goyvaerts
2023).
\end{quote}

So, remember: to use capturing groups in a regular expression, you must
enclose the part of the pattern that you want to capture in parentheses
\texttt{()}. Each set of parentheses creates a new capturing group. This
means that you can create multiple groups inside a single regular
expression and then later reuse the substrings captured by all of these
groups. Awesome, right?

Each new group (that is, each pair of parentheses) that you create in
your regular expression has a different index. That means that the first
group is identified by the index 1, the second group by the index 2, the
third group by the index 3, and so on.
- -Just to quickly demonstrate these capturing groups, here is a quick -example, in pure Python: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{import}\NormalTok{ re} - -\CommentTok{\# A regular expression that contains} -\CommentTok{\# three different capturing groups} -\NormalTok{regex }\OperatorTok{=} \VerbatimStringTok{r"(\textbackslash{}d}\SpecialCharTok{\{3\}}\VerbatimStringTok{){-}(\textbackslash{}d}\SpecialCharTok{\{2\}}\VerbatimStringTok{){-}(\textbackslash{}d}\SpecialCharTok{\{4\}}\VerbatimStringTok{)"} - -\CommentTok{\# Match the regular expression against a string} -\NormalTok{text }\OperatorTok{=} \StringTok{"My social security number is 123{-}45{-}6789."} -\NormalTok{match }\OperatorTok{=}\NormalTok{ re.search(regex, text)} - -\CommentTok{\# Access the captured groups} -\NormalTok{group1 }\OperatorTok{=}\NormalTok{ match.group(}\DecValTok{1}\NormalTok{) }\CommentTok{\# "123"} -\NormalTok{group2 }\OperatorTok{=}\NormalTok{ match.group(}\DecValTok{2}\NormalTok{) }\CommentTok{\# "45"} -\NormalTok{group3 }\OperatorTok{=}\NormalTok{ match.group(}\DecValTok{3}\NormalTok{) }\CommentTok{\# "6789"} -\end{Highlighting} -\end{Shaded} - -In the above example, the regular expression -\texttt{r"(\textbackslash{}d\{3\})-(\textbackslash{}d\{2\})-(\textbackslash{}d\{4\})"} -contains three capturing groups, each enclosed in parentheses. When the -regular expression is matched against the string -\texttt{"My\ social\ security\ number\ is\ 123-45-6789."}, the first -capturing group matches the substring \texttt{"123"}, the second -capturing group matches \texttt{"45"}, and the third capturing group -matches \texttt{"6789"}. - -\begin{figure} - -\centering{ - -\includegraphics{Chapters/./../Figures/substring-capturing-groups.png} - -} - -\caption{\label{fig-substring-capturing-groups}Example of capturing -groups} - -\end{figure}% - -In Python, we can access the captured groups using the \texttt{group()} -method of the \texttt{Match} object returned by \texttt{re.search()}. In -this example, \texttt{match.group(1)} returns the captured substring of -the first capturing group (which is \texttt{"123"}), -\texttt{match.group(2)} returns second \texttt{"45"}, and -\texttt{match.group(3)} returns \texttt{"6789"}. - -\subsubsection{\texorpdfstring{How can we use capturing groups in -\texttt{pyspark} -?}{How can we use capturing groups in pyspark ?}}\label{how-can-we-use-capturing-groups-in-pyspark} - -Ok, now that we understood what capturing groups is, how can we use them -in \texttt{pypspark}? First, remember, capturing groups will be -available to you, only if you enclose a part of your regular expression -in a pair of parentheses. So the first part is to make sure that the -capturing groups are present in your regular expressions. - -After that, you can access the substring matched by the capturing group, -by using the reference index that identifies this capturing group you -want to use. In pure Python, we used the \texttt{group()} method with -the group index (like 1, 2, etc.) to access these values. - -But in \texttt{pyspark}, we access these groups by using a special -pattern formed by the group index preceded by a dollar sign -(\texttt{\$}). That is, the text \texttt{\$1} references the first -capturing group, \texttt{\$2} references the second capturing group, -etc. 
-
-As a first example, let's go back to the regular expression we used at
-Section~\ref{sec-regexp-replace}:
-\texttt{\textbackslash{}\textbackslash{}{[}(INFO\textbar{}ERROR\textbar{}WARN)\textbackslash{}\textbackslash{}{]}:}.
-This regular expression contains one capturing group, which captures the
-type label of the log message:
-\texttt{(INFO\textbar{}ERROR\textbar{}WARN)}.
-
-If we use the special pattern \texttt{\$1} to reference this capturing
-group inside \texttt{regexp\_replace()}, then \texttt{regexp\_replace()}
-will replace every occurrence of the input regular expression found in
-the input string by the substring matched by the first capturing group,
-as you can see in the example below:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{logs}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}
- \StringTok{\textquotesingle{}using\_groups\textquotesingle{}}\NormalTok{,}
-\NormalTok{ regexp\_replace(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, type\_regex, }\StringTok{\textquotesingle{}Type Label {-}\textgreater{} $1 | \textquotesingle{}}\NormalTok{)}
-\NormalTok{ )}\OperatorTok{\textbackslash{}}
-\NormalTok{ .select(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}using\_groups\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(truncate }\OperatorTok{=} \DecValTok{30}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------------------------+------------------------------+
-| message| using_groups|
-+------------------------------+------------------------------+
-|[INFO]: 2022-09-05 03:35:01...|Type Label -> INFO | 2022-0...|
-|[WARN]: 2022-09-05 03:35:58...|Type Label -> WARN | 2022-0...|
-|[INFO]: 2022-09-05 03:40:59...|Type Label -> INFO | 2022-0...|
-|[INFO]: 2022-09-05 03:42:24...|Type Label -> INFO | 2022-0...|
-|[INFO]: 2022-09-05 03:42:37...|Type Label -> INFO | 2022-0...|
-|[WARN]: 2022-09-05 03:52:02...|Type Label -> WARN | 2022-0...|
-|[INFO]: 2022-09-05 04:00:33...|Type Label -> INFO | 2022-0...|
-|[INFO]: 2022-09-05 04:01:15...|Type Label -> INFO | 2022-0...|
-|[INFO]: 2022-09-05 04:01:35...|Type Label -> INFO | 2022-0...|
-|[INFO]: 2022-09-05 04:02:09...|Type Label -> INFO | 2022-0...|
-|[INFO]: 2022-09-05 04:02:09...|Type Label -> INFO | 2022-0...|
-|[INFO]: 2022-09-05 04:02:09...|Type Label -> INFO | 2022-0...|
-|[ERROR]: 2022-09-05 04:02:1...|Type Label -> ERROR | 2022-...|
-|[ERROR]: 2022-09-05 04:02:3...|Type Label -> ERROR | 2022-...|
-|[ERROR]: 2022-09-05 04:02:3...|Type Label -> ERROR | 2022-...|
-|[ERROR]: 2022-09-05 04:02:3...|Type Label -> ERROR | 2022-...|
-+------------------------------+------------------------------+
-\end{verbatim}
-
-In essence, you can reuse the substrings matched by the capturing
-groups by using the special patterns \texttt{\$1}, \texttt{\$2},
-\texttt{\$3}, etc. This means that you can reuse the substrings captured
-by multiple groups at the same time inside \texttt{regexp\_replace()}
-and \texttt{regexp\_extract()}. For example, if we use the replacement
-string \texttt{"\$1,\ \$2,\ \$3"} inside \texttt{regexp\_replace()}, we
-would get the substrings matched by the first, second and third
-capturing groups, separated by commas. The short sketch below
-demonstrates this idea.
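-
-Just to illustrate this multi-group replacement, here is a minimal
-sketch. It uses a small, hypothetical single-column DataFrame (the
-names \texttt{ssn\_df} and \texttt{ssn\_regex} are made up for this
-illustration, and the \texttt{spark} session is assumed to exist) and
-the social security number regex from the pure Python example,
-reordering the three captured blocks of digits:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# A small DataFrame with a single string column}
-\NormalTok{from pyspark.sql.functions import regexp\_replace}
-
-\NormalTok{ssn\_df = spark.createDataFrame(}
-\NormalTok{    [(\textquotesingle{}123{-}45{-}6789\textquotesingle{},)],}
-\NormalTok{    [\textquotesingle{}ssn\textquotesingle{}]}
-\NormalTok{)}
-
-\CommentTok{\# The same three capturing groups from the pure Python example}
-\NormalTok{ssn\_regex = r\textquotesingle{}(\textbackslash{}d\{3\}){-}(\textbackslash{}d\{2\}){-}(\textbackslash{}d\{4\})\textquotesingle{}}
-
-\CommentTok{\# $3, $2 and $1 reuse the third, second and first groups}
-\NormalTok{ssn\_df\textbackslash{}}
-\NormalTok{    .withColumn(}
-\NormalTok{        \textquotesingle{}reordered\textquotesingle{},}
-\NormalTok{        regexp\_replace(\textquotesingle{}ssn\textquotesingle{}, ssn\_regex, \textquotesingle{}$3 $2 $1\textquotesingle{})}
-\NormalTok{    )\textbackslash{}}
-\NormalTok{    .show()}
-\end{Highlighting}
-\end{Shaded}
-
-In this sketch, the \texttt{reordered} column should contain the value
-\texttt{"6789\ 45\ 123"}, because each \texttt{\$} reference is replaced
-by the substring captured by the corresponding group.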
-
-However, it is also worth emphasizing a small limitation of this
-system. When you need to reuse the substrings captured by multiple
-groups together, it is important that you add some amount of space (or
-some delimiter character) between each group reference, like
-\texttt{"\$1\ \$2\ \$3"}.
-
-If you write these group references right next to each other (as in
-\texttt{"\$1\$2\$3"}), it will not work. In other words, Spark will not
-understand that you are trying to access a capturing group. It will
-interpret the text \texttt{"\$1\$2\$3"} as the literal value
-\texttt{"\$1\$2\$3"}, and not as a special pattern that references
-multiple capturing groups of the regular expression.
-
-\subsection{\texorpdfstring{Extracting substrings with
-\texttt{regexp\_extract()}}{Extracting substrings with regexp\_extract()}}\label{sec-regexp-extract}
-
-Another very useful regular expression activity is to extract, from a
-given string, a substring that matches a specified regular expression
-pattern. The \texttt{regexp\_extract()} function is the main tool used
-for this task.
-
-This function takes three arguments, which are:
-
-\begin{itemize}
-\tightlist
-\item
-  The input column name or expression that contains the string to be
-  searched;
-\item
-  The regular expression pattern to search for within the input string;
-\item
-  The index of the capturing group within the regular expression pattern
-  that corresponds to the substring to extract;
-\end{itemize}
-
-You may (or may not) use capturing groups inside
-\texttt{regexp\_replace()}. The \texttt{regexp\_extract()} function, on
-the other hand, \textbf{is based on} the capturing groups
-functionality. As a consequence, when you use
-\texttt{regexp\_extract()}, you must give it a regular expression that
-\textbf{contains at least one capturing group}; otherwise, the
-\texttt{regexp\_extract()} function becomes useless.
-
-In other words, the \texttt{regexp\_extract()} function extracts
-substrings that are matched by the capturing groups present in your
-input regular expression. If you want, for example, to use
-\texttt{regexp\_extract()} to extract the substring matched by an entire
-regular expression, then you just need to surround this entire regular
-expression with a pair of parentheses. This way you transform the entire
-regular expression into a capturing group, and, therefore, you can
-extract the substring matched by this group.
-
-As an example, let's go back again to the regular expression we used in
-the \texttt{logs} DataFrame:
-\texttt{\textbackslash{}\textbackslash{}{[}(INFO\textbar{}ERROR\textbar{}WARN)\textbackslash{}\textbackslash{}{]}:}.
-We can extract the type label of the log message by using the index 1
-to reference the first (and only) capturing group in this regular
-expression.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ regexp\_extract}
-
-\NormalTok{logs}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}
- \StringTok{\textquotesingle{}message\_type\textquotesingle{}}\NormalTok{,}
-\NormalTok{ regexp\_extract(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, type\_regex, }\DecValTok{1}\NormalTok{)}
-\NormalTok{ )}\OperatorTok{\textbackslash{}}
-\NormalTok{ .select(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}message\_type\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(truncate }\OperatorTok{=} \DecValTok{30}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------------------------+------------+
-| message|message_type|
-+------------------------------+------------+
-|[INFO]: 2022-09-05 03:35:01...| INFO|
-|[WARN]: 2022-09-05 03:35:58...| WARN|
-|[INFO]: 2022-09-05 03:40:59...| INFO|
-|[INFO]: 2022-09-05 03:42:24...| INFO|
-|[INFO]: 2022-09-05 03:42:37...| INFO|
-|[WARN]: 2022-09-05 03:52:02...| WARN|
-|[INFO]: 2022-09-05 04:00:33...| INFO|
-|[INFO]: 2022-09-05 04:01:15...| INFO|
-|[INFO]: 2022-09-05 04:01:35...| INFO|
-|[INFO]: 2022-09-05 04:02:09...| INFO|
-|[INFO]: 2022-09-05 04:02:09...| INFO|
-|[INFO]: 2022-09-05 04:02:09...| INFO|
-|[ERROR]: 2022-09-05 04:02:1...| ERROR|
-|[ERROR]: 2022-09-05 04:02:3...| ERROR|
-|[ERROR]: 2022-09-05 04:02:3...| ERROR|
-|[ERROR]: 2022-09-05 04:02:3...| ERROR|
-+------------------------------+------------+
-\end{verbatim}
-
-As another example, let's suppose we wanted to extract not only the type
-of the log message, but also the timestamp and the content of the
-message, and store these different elements in separate columns.
-
-To do that, we could build a more complete regular expression: an
-expression capable of matching the entire log message and, at the same
-time, capturing each of these different elements inside a different
-capturing group. The code below is an example that produces such a
-regular expression, and applies it over the \texttt{logs} DataFrame.
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{type\_regex }\OperatorTok{=} \VerbatimStringTok{r\textquotesingle{}\textbackslash{}[(INFO|ERROR|WARN)\textbackslash{}]: \textquotesingle{}} - -\NormalTok{date\_regex }\OperatorTok{=} \VerbatimStringTok{r\textquotesingle{}\textbackslash{}d}\SpecialCharTok{\{4\}}\VerbatimStringTok{{-}\textbackslash{}d}\SpecialCharTok{\{2\}}\VerbatimStringTok{{-}\textbackslash{}d}\SpecialCharTok{\{2\}}\VerbatimStringTok{\textquotesingle{}} -\NormalTok{time\_regex }\OperatorTok{=} \VerbatimStringTok{r\textquotesingle{} \textbackslash{}d}\SpecialCharTok{\{2\}}\VerbatimStringTok{:\textbackslash{}d}\SpecialCharTok{\{2\}}\VerbatimStringTok{:\textbackslash{}d}\SpecialCharTok{\{2\}}\VerbatimStringTok{([.]\textbackslash{}d+)?\textquotesingle{}} -\NormalTok{timestamp\_regex }\OperatorTok{=}\NormalTok{ date\_regex }\OperatorTok{+}\NormalTok{ time\_regex} -\NormalTok{timestamp\_regex }\OperatorTok{=} \VerbatimStringTok{r\textquotesingle{}(\textquotesingle{}} \OperatorTok{+}\NormalTok{ timestamp\_regex }\OperatorTok{+} \VerbatimStringTok{r\textquotesingle{})\textquotesingle{}} - -\NormalTok{regex }\OperatorTok{=}\NormalTok{ type\_regex }\OperatorTok{+}\NormalTok{ timestamp\_regex }\OperatorTok{+} \VerbatimStringTok{r\textquotesingle{}(.+)$\textquotesingle{}} - -\NormalTok{logs}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}message\_type\textquotesingle{}}\NormalTok{,} -\NormalTok{ regexp\_extract(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, regex, }\DecValTok{1}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}timestamp\textquotesingle{}}\NormalTok{,} -\NormalTok{ regexp\_extract(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, regex, }\DecValTok{2}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}message\_content\textquotesingle{}}\NormalTok{,} -\NormalTok{ regexp\_extract(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, regex, }\DecValTok{4}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}message\_type\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}timestamp\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}message\_content\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{, truncate }\OperatorTok{=} \DecValTok{30}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-----------------------+ -|message_type| timestamp| -+------------+-----------------------+ -| INFO| 2022-09-05 03:35:01.43| -| WARN|2022-09-05 03:35:58.007| -| INFO|2022-09-05 03:40:59.054| -| INFO| 2022-09-05 03:42:24| -| INFO| 2022-09-05 03:42:37| -+------------+-----------------------+ -only showing top 5 rows -... with 1 more columns: message_cont -ent -\end{verbatim} - -\subsection{\texorpdfstring{Identifying values that match a particular -regular expression with -\texttt{rlike()}}{Identifying values that match a particular regular expression with rlike()}}\label{identifying-values-that-match-a-particular-regular-expression-with-rlike} - -The \texttt{rlike()} column method is useful for checking if a string -value in a column matches a specific regular expression. We briefly -introduced this method at Section~\ref{sec-filter-regex-pattern}. 
This method has only one input, which is the regular expression you
-want to apply over the column values.
-
-As an example, let's suppose you wanted to identify timestamp values
-inside your strings. You could use a regular expression pattern to find
-which text values have these kinds of values inside them.
-
-A possible regular expression candidate would be
-\texttt{"{[}0-9{]}\{2\}:{[}0-9{]}\{2\}:{[}0-9{]}\{2\}({[}.{]}{[}0-9{]}+)?"}.
-This regex matches timestamp values in the format ``hh:mm:ss.sss''. This
-pattern consists of the following building blocks, or elements:
-
-\begin{itemize}
-\tightlist
-\item
-  \texttt{{[}0-9{]}\{2\}}: Matches any two digits from 0 to 9.
-\item
-  \texttt{:}: Matches a colon character.
-\item
-  \texttt{({[}.{]}{[}0-9{]}+)?}: Matches an optional decimal point
-  followed by one or more digits.
-\end{itemize}
-
-If we apply this pattern over all log messages stored in the
-\texttt{logs} DataFrame, we will find that all log messages match this
-particular regular expression, because all log messages contain a
-timestamp value at the start of the message:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col}
-
-\NormalTok{pattern }\OperatorTok{=} \StringTok{"[0{-}9]}\SpecialCharTok{\{2\}}\StringTok{:[0{-}9]}\SpecialCharTok{\{2\}}\StringTok{:[0{-}9]}\SpecialCharTok{\{2\}}\StringTok{([.][0{-}9]+)?"}
-\NormalTok{logs}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}
- \StringTok{\textquotesingle{}does\_it\_match?\textquotesingle{}}\NormalTok{,}
-\NormalTok{ col(}\StringTok{"message"}\NormalTok{).rlike(pattern)}
-\NormalTok{ )}\OperatorTok{\textbackslash{}}
-\NormalTok{ .select(}\StringTok{\textquotesingle{}message\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}does\_it\_match?\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show(}\DecValTok{5}\NormalTok{, truncate }\OperatorTok{=}\NormalTok{ n\_truncate)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+--------------------------------------------------+
-| message|
-+--------------------------------------------------+
-|[INFO]: 2022-09-05 03:35:01.43 Looking for work...|
-|[WARN]: 2022-09-05 03:35:58.007 Workers are una...|
-|[INFO]: 2022-09-05 03:40:59.054 Looking for wor...|
-|[INFO]: 2022-09-05 03:42:24 3 Workers were acqu...|
-|[INFO]: 2022-09-05 03:42:37 Initializing instan...|
-+--------------------------------------------------+
-only showing top 5 rows
-... with 1 more columns: does_it_match?
-\end{verbatim}
-
-\bookmarksetup{startatroot}
-
-\chapter{Tools for dates and datetimes
-manipulation}\label{sec-datetime-tools}
-
-Units of measurement that represent time are very common types of data
-in our modern world. Nowadays, dates and datetimes (or timestamps) are
-the most common units used to represent a specific point in time. In
-this chapter, you will learn how to import, manipulate and use this kind
-of data with \texttt{pyspark}.
-
-In Spark, dates and datetimes are represented by the \texttt{DateType}
-and \texttt{TimestampType} data types, respectively, which are available
-in the \texttt{pyspark.sql.types} module. Spark also offers two other
-data types to represent ``intervals of time'', which are
-\texttt{YearMonthIntervalType} and \texttt{DayTimeIntervalType}.
-However, you usually don't use these types directly to create new
-objects. In other words, they are intermediate types. They are a
-passage, or a path you use to get to another data type.
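-
-Just to make the location of these two main types concrete, the short
-sketch below (a hypothetical example, not one of the datasets used in
-this book, and assuming the \texttt{spark} session already exists)
-imports \texttt{DateType} and \texttt{TimestampType} from
-\texttt{pyspark.sql.types} and uses them to define the schema of a tiny
-DataFrame explicitly:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{from datetime import date, datetime}
-\NormalTok{from pyspark.sql.types import StructType, StructField}
-\NormalTok{from pyspark.sql.types import DateType, TimestampType}
-
-\CommentTok{\# A schema with one date column and one timestamp column}
-\NormalTok{schema = StructType([}
-\NormalTok{    StructField(\textquotesingle{}as\_date\textquotesingle{}, DateType(), True),}
-\NormalTok{    StructField(\textquotesingle{}as\_timestamp\textquotesingle{}, TimestampType(), True)}
-\NormalTok{])}
-
-\NormalTok{data = [(date(2021, 1, 1), datetime(2021, 1, 1, 12, 30, 0))]}
-\NormalTok{spark.createDataFrame(data, schema = schema).printSchema()}
-\end{Highlighting}
-\end{Shaded}
-
-Most of the time, though, you will not build these column types by hand
-like this; you will get them by converting other columns, which is
-exactly what the next sections describe.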
-
-\section{Creating date values}\label{sec-create-dates}
-
-Dates are normally interpreted in \texttt{pyspark} using the
-\texttt{DateType} data type. There are three common ways to create date
-objects, which are:
-
-\begin{enumerate}
-\def\labelenumi{\arabic{enumi}.}
-\tightlist
-\item
-  from strings (like \texttt{"3\ of\ June\ of\ 2023"}, or maybe,
-  \texttt{"2023-02-05"}).
-\item
-  by extracting the date component from datetime values (i.e.~values of
-  type \texttt{TimestampType}).
-\item
-  by combining day, month and year components to build a date object.
-\end{enumerate}
-
-\subsection{From strings}\label{from-strings}
-
-When you have a \texttt{StringType} column in your DataFrame that
-contains dates that are currently stored as strings, and you want to
-convert this column into a \texttt{DateType} column, you basically have
-two choices: 1) use the automatic column conversion with
-\texttt{cast()} or \texttt{astype()}; 2) use the \texttt{to\_date()}
-Spark SQL function to convert the strings using a specific date format.
-
-When you use the \texttt{cast()} (or \texttt{astype()}) column method
-that we introduced at Section~\ref{sec-cast-column-type}, Spark will
-perform a quick and automatic conversion of your strings into the
-\texttt{DateType}. But when you use this method, Spark will always
-assume that the dates you have are in the ISO-8601 format, which is the
-international standard for dates. This format is presented at
-Figure~\ref{fig-iso-8601-dates}:
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=0.8\textwidth,height=\textheight]{Chapters/./../Figures/format_date.png}
-
-}
-
-\caption{\label{fig-iso-8601-dates}The ISO-8601 format for dates}
-
-\end{figure}%
-
-Basically, the ISO-8601 standard specifies that dates are represented in
-the format ``YYYY-MM-DD'' (or ``Year-Month-Day''), like
-\texttt{2023-09-19}, \texttt{1926-05-21}, or \texttt{2005-11-01}. This
-format is also widely used in computing systems around the world. So, if
-the dates you have (which are currently stored inside strings) are
-formatted like the ISO-8601 standard, then you can safely and easily
-convert them into the \texttt{DateType} by using the \texttt{cast()} or
-\texttt{astype()} column methods.
-
-However, if these dates are formatted in a different way, the
-\texttt{cast()} method will very likely produce \texttt{null} values as
-a result, because it cannot parse dates that are outside the ISO-8601
-format. If that is your case, then you should use the
-\texttt{to\_date()} function, which allows you to specify the exact
-format of your dates.
-
-There are many examples of date formats which are outside of the
-ISO-8601 format. Like:
-
-\begin{itemize}
-\tightlist
-\item
-  In Brazil and Spain, dates are formatted as ``Day/Month/Year''.
-  Example: ``23/04/2022'' for April 23, 2022.
-\item
-  In Japan, dates are formatted as ``year month day (weekday)'', with
-  the Japanese characters meaning ``year'', ``month'' and ``day''
-  inserted after the numerals. Example: 2008年12月31日 (水) for
-  ``Wednesday 31 December 2008''.
-\item
-  Many websites display dates using the full name of the month, like
-  ``November 18, 2023''. This is an important fact considering that
-  web scraping is a real and important area of data analysis these
-  days.
-\end{itemize} - -I will describe at Section~\ref{sec-datetime-patterns} how you can use -the \texttt{to\_date()} function to convert dates that are outside of -the ISO format to \texttt{DateType} values. But for now, for simplicity -sake, I will consider only strings that contains date in the ISO format. - -As a first example, lets consider the DataFrame \texttt{df} below: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ Row} - -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ Row(date\_registered }\OperatorTok{=} \StringTok{"2021{-}01{-}01"}\NormalTok{),} -\NormalTok{ Row(date\_registered }\OperatorTok{=} \StringTok{"2021{-}01{-}01"}\NormalTok{),} -\NormalTok{ Row(date\_registered }\OperatorTok{=} \StringTok{"2021{-}01{-}02"}\NormalTok{),} -\NormalTok{ Row(date\_registered }\OperatorTok{=} \StringTok{"2021{-}01{-}03"}\NormalTok{)} -\NormalTok{]} - -\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data)} -\end{Highlighting} -\end{Shaded} - -If we look at the DataFrame schema of \texttt{df}, we can see that the -\texttt{date\_registered} column is currently being interpreted as a -column of type \texttt{StringType}: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df.printSchema()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -root - |-- date_registered: string (nullable = true) -\end{verbatim} - -Since the dates from the \texttt{date\_registered} column are formatted -like the ISO-8601 standard, we can safely use \texttt{cast()} to get a -column of type \texttt{DateType}. And if we look again at the DataFrame -schema after the transformation, we can certify that the -\texttt{date\_registered} column is in fact now, a column of type -\texttt{DateType}. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col} - -\NormalTok{df }\OperatorTok{=}\NormalTok{ df.withColumn(} - \StringTok{\textquotesingle{}date\_registered\textquotesingle{}}\NormalTok{,} -\NormalTok{ col(}\StringTok{\textquotesingle{}date\_registered\textquotesingle{}}\NormalTok{).cast(}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{)} -\NormalTok{)} - -\NormalTok{df.show()} -\end{Highlighting} -\end{Shaded} - - -\begin{verbatim} -+---------------+ -|date_registered| -+---------------+ -| 2021-01-01| -| 2021-01-01| -| 2021-01-02| -| 2021-01-03| -+---------------+ -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df.printSchema()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -root - |-- date_registered: date (nullable = true) -\end{verbatim} - -\subsection{From datetime values}\label{from-datetime-values} - -A datetime value is a value that contains both a date component and a -time component. But you can obviously extract just the date component -from a datetime value. 
Let's use the following DataFrame as example: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql }\ImportTok{import}\NormalTok{ Row} -\ImportTok{from}\NormalTok{ datetime }\ImportTok{import}\NormalTok{ datetime} - -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ \{}\StringTok{\textquotesingle{}as\_datetime\textquotesingle{}}\NormalTok{: datetime(}\DecValTok{2021}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{12}\NormalTok{, }\DecValTok{10}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{)\},} -\NormalTok{ \{}\StringTok{\textquotesingle{}as\_datetime\textquotesingle{}}\NormalTok{: datetime(}\DecValTok{2021}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{12}\NormalTok{, }\DecValTok{18}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{)\},} -\NormalTok{ \{}\StringTok{\textquotesingle{}as\_datetime\textquotesingle{}}\NormalTok{: datetime(}\DecValTok{2021}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{13}\NormalTok{, }\DecValTok{7}\NormalTok{, }\DecValTok{0}\NormalTok{, }\DecValTok{0}\NormalTok{)\},} -\NormalTok{ \{}\StringTok{\textquotesingle{}as\_datetime\textquotesingle{}}\NormalTok{: datetime(}\DecValTok{2021}\NormalTok{, }\DecValTok{6}\NormalTok{, }\DecValTok{14}\NormalTok{, }\DecValTok{19}\NormalTok{, }\DecValTok{30}\NormalTok{, }\DecValTok{0}\NormalTok{)\}} -\NormalTok{]} - -\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data)} -\NormalTok{df.printSchema()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -root - |-- as_datetime: timestamp (nullable = true) -\end{verbatim} - -You can extract the date component from the \texttt{as\_datetime} column -by directly casting the column into the \texttt{DateType} type. Like you -would normally do with a string column. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df.withColumn(}\StringTok{\textquotesingle{}date\_component\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}as\_datetime\textquotesingle{}}\NormalTok{).cast(}\StringTok{\textquotesingle{}date\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------------------+--------------+ -| as_datetime|date_component| -+-------------------+--------------+ -|2021-06-12 10:00:00| 2021-06-12| -|2021-06-12 18:00:00| 2021-06-12| -|2021-06-13 07:00:00| 2021-06-13| -|2021-06-14 19:30:00| 2021-06-14| -+-------------------+--------------+ -\end{verbatim} - -\subsection{From individual -components}\label{from-individual-components} - -If you have 3 columns in your DataFrame, one for each component of a -date value (day, month, year), you can group these components together -to form date values. In \texttt{pyspark} you do this by using the -\texttt{make\_date()} function. - -To use this function, you just list the columns of each component in the -following order: year, month and day. 
Like in this example: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ make\_date} -\NormalTok{df3 }\OperatorTok{=}\NormalTok{ spark.createDataFrame([} -\NormalTok{ Row(day}\OperatorTok{=}\DecValTok{14}\NormalTok{,month}\OperatorTok{=}\DecValTok{2}\NormalTok{,year}\OperatorTok{=}\DecValTok{2021}\NormalTok{),} -\NormalTok{ Row(day}\OperatorTok{=}\DecValTok{30}\NormalTok{,month}\OperatorTok{=}\DecValTok{4}\NormalTok{,year}\OperatorTok{=}\DecValTok{2021}\NormalTok{),} -\NormalTok{ Row(day}\OperatorTok{=}\DecValTok{2}\NormalTok{,month}\OperatorTok{=}\DecValTok{5}\NormalTok{,year}\OperatorTok{=}\DecValTok{2021}\NormalTok{),} -\NormalTok{ Row(day}\OperatorTok{=}\DecValTok{6}\NormalTok{,month}\OperatorTok{=}\DecValTok{5}\NormalTok{,year}\OperatorTok{=}\DecValTok{2021}\NormalTok{)} -\NormalTok{])} - -\NormalTok{df3}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}as\_date\textquotesingle{}}\NormalTok{,} -\NormalTok{ make\_date(} -\NormalTok{ col(}\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{),} -\NormalTok{ col(}\StringTok{\textquotesingle{}month\textquotesingle{}}\NormalTok{),} -\NormalTok{ col(}\StringTok{\textquotesingle{}day\textquotesingle{}}\NormalTok{)} -\NormalTok{ )} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+---+-----+----+----------+ -|day|month|year| as_date| -+---+-----+----+----------+ -| 14| 2|2021|2021-02-14| -| 30| 4|2021|2021-04-30| -| 2| 5|2021|2021-05-02| -| 6| 5|2021|2021-05-06| -+---+-----+----+----------+ -\end{verbatim} - -\section{Creating datetime values}\label{creating-datetime-values} - -Datetime values are values that contains both a date and a time -components. These datetime values might also come with a time zone -component, although this component is not mandatory. - -Different programming languages and frameworks might have different -names for these kind of values. Because of that, you might know datetime -values by a different name. For example, ``timestamps'' is also a very -popular name for this kind of values. Anyway, in \texttt{pyspark}, -datetime (or timestamp) values are interpreted by the -\texttt{TimestampType} data type. - -There are three commom ways to create datetime objects, which are: - -\begin{enumerate} -\def\labelenumi{\arabic{enumi}.} -\tightlist -\item - from strings (like \texttt{"2023-02-05\ 12:30:00"}). -\item - from integer values (i.e.~\texttt{IntegerType} and \texttt{LongType}). -\item - by combining the individual components of a datetime value (day, - month, year, hour, minute, second, etc.) to form a complete datetime - value. -\end{enumerate} - -\subsection{From strings}\label{from-strings-1} - -Again, if your datetime values are being currently stored inside -strings, and, you want to convert them to datetime values, you can use -the automatic conversion of Spark with the \texttt{cast()} column -method. However, as we stated at Section~\ref{sec-create-dates}, when -you use this path, Spark will always assume that your datetime values -follow the ISO-8601 format. - -The ISO-8601 standard states that datetime values should always follow -the format ``YYYY-MM-DD HH:MM:SS Z'' (or ``Year-Month-Day -Hour:Minute:Second TimeZone''). 
You can see at -Figure~\ref{fig-iso-8601-datetime}, that are components that are -mandatory (meaning that they always appear in a datetime value), and -components that are optional (meaning that they might appear or might -not in a datetime value). - -The time zone component is always optional, and because of that, they -usually are not present in the datetime values you have. There are some -variations and extra characters that might appear at some points, for -example, the character `T' might appear to clearly separate the date -from the time component, and the character `Z' to separate the time zone -from the time component. Also, the time component might have a -microseconds section to identify a more precise point in time. - -But despite all these variations, datetime values that are ISO-8601 -compatible are basically always following this same pattern (or this -same order of components) of ``Year-Month-Day Hour:Minute:Second -TimeZone''. Examples of values that follows this standard are -``2014-10-18T16:30:00Z'', ``2019-05-09 08:20:05.324'' and ``2020-01-01 -12:30:00 -03''. - -\begin{figure} - -\centering{ - -\includegraphics[width=1\textwidth,height=\textheight]{Chapters/./../Figures/format_datetime.png} - -} - -\caption{\label{fig-iso-8601-datetime}The ISO-8601 format for -datetime/timestamp values} - -\end{figure}% - -This means that if your datetime values are not in this format, if they -do not follow the ISO-8601 standard, then, you should not use the -\texttt{cast()} method. If that is your case, you should use datetime -patterns with the \texttt{to\_timestamp()} function. We describe these -assets in more depth at Section~\ref{sec-datetime-patterns}. - -For now, let's focus on examples that use the ISO-8601 format. As an -example, lets use the \texttt{df2} DataFrame below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ Row(datetime\_as\_string }\OperatorTok{=} \StringTok{"2021{-}02{-}23T04:41:57Z"}\NormalTok{),} -\NormalTok{ Row(datetime\_as\_string }\OperatorTok{=} \StringTok{"2021{-}05{-}18T12:30:05Z"}\NormalTok{),} -\NormalTok{ Row(datetime\_as\_string }\OperatorTok{=} \StringTok{"2021{-}11{-}13T16:30:00Z"}\NormalTok{),} -\NormalTok{ Row(datetime\_as\_string }\OperatorTok{=} \StringTok{"2021{-}08{-}09T00:30:16Z"}\NormalTok{)} -\NormalTok{]} - -\NormalTok{df2 }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data)} -\NormalTok{df2.printSchema()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -root - |-- datetime_as_string: string (nullable = true) -\end{verbatim} - -You can see above, that the \texttt{datetime\_as\_string} column is -currently being interpreted as a column of strings. But since the values -are in ISO-8601 format, I can use the cast method to directly convert -them into timestamp values. 
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df2 }\OperatorTok{=}\NormalTok{ df2}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}
- \StringTok{\textquotesingle{}datetime\_values\textquotesingle{}}\NormalTok{,}
-\NormalTok{ col(}\StringTok{\textquotesingle{}datetime\_as\_string\textquotesingle{}}\NormalTok{).cast(}\StringTok{\textquotesingle{}timestamp\textquotesingle{}}\NormalTok{)}
-\NormalTok{ )}
-
-\NormalTok{df2.printSchema()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-root
- |-- datetime_as_string: string (nullable = true)
- |-- datetime_values: timestamp (nullable = true)
-\end{verbatim}
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df2.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+--------------------+-------------------+
-| datetime_as_string| datetime_values|
-+--------------------+-------------------+
-|2021-02-23T04:41:57Z|2021-02-23 01:41:57|
-|2021-05-18T12:30:05Z|2021-05-18 09:30:05|
-|2021-11-13T16:30:00Z|2021-11-13 13:30:00|
-|2021-08-09T00:30:16Z|2021-08-08 21:30:16|
-+--------------------+-------------------+
-\end{verbatim}
-
-\subsection{From integers}\label{from-integers}
-
-You can also convert integers directly to datetime values by using the
-\texttt{cast()} method. In this situation, the integers are interpreted
-as being the number of seconds since the UNIX time epoch, which is
-midnight of 1 January 1970 (\texttt{"1970-01-01\ 00:00:00"}). In other
-words, the integer \texttt{60} will be converted into the point in time
-that is 60 seconds after \texttt{"1970-01-01\ 00:00:00"}, which would be
-\texttt{"1970-01-01\ 00:01:00"}.
-
-In the example below, the number 1,000,421,325 is converted into
-19:48:45 of 13 September 2001, because this exact point in time is
-roughly 1.000421 billion seconds after the UNIX epoch.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{df3 }\OperatorTok{=}\NormalTok{ spark.createDataFrame([}
-\NormalTok{ Row(datetime\_as\_integer }\OperatorTok{=} \DecValTok{1000421325}\NormalTok{),}
-\NormalTok{ Row(datetime\_as\_integer }\OperatorTok{=} \DecValTok{1000423628}\NormalTok{),}
-\NormalTok{ Row(datetime\_as\_integer }\OperatorTok{=} \DecValTok{500}\NormalTok{),}
-\NormalTok{ Row(datetime\_as\_integer }\OperatorTok{=} \DecValTok{1000493412}\NormalTok{)}
-\NormalTok{])}
-
-\NormalTok{df3 }\OperatorTok{=}\NormalTok{ df3}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}
- \StringTok{\textquotesingle{}datetime\_values\textquotesingle{}}\NormalTok{,}
-\NormalTok{ col(}\StringTok{\textquotesingle{}datetime\_as\_integer\textquotesingle{}}\NormalTok{).cast(}\StringTok{\textquotesingle{}timestamp\textquotesingle{}}\NormalTok{)}
-\NormalTok{ )}
-
-\NormalTok{df3.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-------------------+-------------------+
-|datetime_as_integer| datetime_values|
-+-------------------+-------------------+
-| 1000421325|2001-09-13 19:48:45|
-| 1000423628|2001-09-13 20:27:08|
-| 500|1969-12-31 21:08:20|
-| 1000493412|2001-09-14 15:50:12|
-+-------------------+-------------------+
-\end{verbatim}
-
-However, you probably noticed that something in the example above is
-odd: the number 500 was converted into
-\texttt{"1969-12-31\ 21:08:20"}, which is, in theory, behind the UNIX
-epoch of 1 January 1970. Why did that happen? The answer is that
-\textbf{your time zone is always taken into account} during a
-conversion from integers to datetime values!
- -In the example above, Spark is running on an operating system that is -using the America/Sao\_Paulo time zone (which is 3 hours late from -international time zone - UTC-3) as the ``default time zone'' of the -system. As a result, integers will be interpreted as being the number of -seconds since the UNIX time epoch \textbf{minus 3 hours}, which is -\texttt{"1969-12-31\ 21:00:00"}. So, in this context, the integer -\texttt{60} would be converted into \texttt{"1969-12-31\ 21:01:00"} -(instead of the usual \texttt{"1970-01-01\ 00:01:00"} that you would -expect). - -That is why the number 500 was converted into -\texttt{"1969-12-31\ 21:08:20"}. Because it is 500 seconds ahead of -\texttt{"1969-12-31\ 21:00:00"}, which is 3 hours behind the UNIX time -epoch. - -But what if you wanted to convert your integers into a UTC-0 time zone? -Well, you could just set the time zone of your Spark Session to the -international time zone before you do the conversion, with the command -below: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{spark.conf.}\BuiltInTok{set}\NormalTok{(}\StringTok{"spark.sql.session.timeZone"}\NormalTok{, }\StringTok{"UTC"}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -But you could also do the conversion anyway and adjust the values later. -That is, you just perform the conversion with the \texttt{cast()} -method, even if the result would be in your current time zone. After -that, you add the amount of time necessary to transpose your datetime -values to the international time zone (UTC-0). - -So in the above example, since I was using the Brasília time zone -(UTC-3) during the above conversion, I just need to add 3 hours to all -datetime values to get their equivalents values in international time -zone. You can do that by using interval expressions, which will be -discussed in more depth at Section~\ref{sec-interval-express}. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ expr} -\NormalTok{df3 }\OperatorTok{=}\NormalTok{ df3}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}datetime\_values\_utc0\textquotesingle{}}\NormalTok{,} -\NormalTok{ expr(}\StringTok{"datetime\_values + INTERVAL 3 HOURS"}\NormalTok{)} -\NormalTok{ )} - -\NormalTok{df3.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------------------+-------------------+--------------------+ -|datetime_as_integer| datetime_values|datetime_values_utc0| -+-------------------+-------------------+--------------------+ -| 1000421325|2001-09-13 19:48:45| 2001-09-13 22:48:45| -| 1000423628|2001-09-13 20:27:08| 2001-09-13 23:27:08| -| 500|1969-12-31 21:08:20| 1970-01-01 00:08:20| -| 1000493412|2001-09-14 15:50:12| 2001-09-14 18:50:12| -+-------------------+-------------------+--------------------+ -\end{verbatim} - -\subsection{From individual -components}\label{from-individual-components-1} - -If you have 6 columns in your DataFrame, one for each component of a -datetime value (day, month, year, hour, minute, second), you can group -these components together to compose datetime values. We do this in -\texttt{pyspark} by using the \texttt{make\_timestamp()} function. - -To use this function, you just list the columns of each component in the -following order: year, month, day, hours, minutes, seconds. 
Like in this example:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ make\_timestamp}
-\NormalTok{df3 }\OperatorTok{=}\NormalTok{ spark.createDataFrame([}
-\NormalTok{ Row(day}\OperatorTok{=}\DecValTok{14}\NormalTok{,month}\OperatorTok{=}\DecValTok{2}\NormalTok{,year}\OperatorTok{=}\DecValTok{2021}\NormalTok{,hour}\OperatorTok{=}\DecValTok{12}\NormalTok{,mins}\OperatorTok{=}\DecValTok{45}\NormalTok{,secs}\OperatorTok{=}\DecValTok{0}\NormalTok{),}
-\NormalTok{ Row(day}\OperatorTok{=}\DecValTok{30}\NormalTok{,month}\OperatorTok{=}\DecValTok{4}\NormalTok{,year}\OperatorTok{=}\DecValTok{2021}\NormalTok{,hour}\OperatorTok{=}\DecValTok{8}\NormalTok{,mins}\OperatorTok{=}\DecValTok{10}\NormalTok{,secs}\OperatorTok{=}\DecValTok{0}\NormalTok{),}
-\NormalTok{ Row(day}\OperatorTok{=}\DecValTok{2}\NormalTok{,month}\OperatorTok{=}\DecValTok{5}\NormalTok{,year}\OperatorTok{=}\DecValTok{2021}\NormalTok{,hour}\OperatorTok{=}\DecValTok{5}\NormalTok{,mins}\OperatorTok{=}\DecValTok{9}\NormalTok{,secs}\OperatorTok{=}\DecValTok{12}\NormalTok{),}
-\NormalTok{ Row(day}\OperatorTok{=}\DecValTok{6}\NormalTok{,month}\OperatorTok{=}\DecValTok{5}\NormalTok{,year}\OperatorTok{=}\DecValTok{2021}\NormalTok{,hour}\OperatorTok{=}\DecValTok{0}\NormalTok{,mins}\OperatorTok{=}\DecValTok{34}\NormalTok{,secs}\OperatorTok{=}\DecValTok{4}\NormalTok{)}
-\NormalTok{])}
-
-\NormalTok{df3}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}
- \StringTok{\textquotesingle{}as\_datetime\textquotesingle{}}\NormalTok{,}
-\NormalTok{ make\_timestamp(}
-\NormalTok{ col(}\StringTok{\textquotesingle{}year\textquotesingle{}}\NormalTok{),}
-\NormalTok{ col(}\StringTok{\textquotesingle{}month\textquotesingle{}}\NormalTok{),}
-\NormalTok{ col(}\StringTok{\textquotesingle{}day\textquotesingle{}}\NormalTok{),}
-\NormalTok{ col(}\StringTok{\textquotesingle{}hour\textquotesingle{}}\NormalTok{),}
-\NormalTok{ col(}\StringTok{\textquotesingle{}mins\textquotesingle{}}\NormalTok{),}
-\NormalTok{ col(}\StringTok{\textquotesingle{}secs\textquotesingle{}}\NormalTok{)}
-\NormalTok{ )}
-\NormalTok{ )}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+---+-----+----+----+----+----+-------------------+
-|day|month|year|hour|mins|secs| as_datetime|
-+---+-----+----+----+----+----+-------------------+
-| 14| 2|2021| 12| 45| 0|2021-02-14 12:45:00|
-| 30| 4|2021| 8| 10| 0|2021-04-30 08:10:00|
-| 2| 5|2021| 5| 9| 12|2021-05-02 05:09:12|
-| 6| 5|2021| 0| 34| 4|2021-05-06 00:34:04|
-+---+-----+----+----+----+----+-------------------+
-\end{verbatim}
-
-\section{Introducing datetime patterns}\label{sec-datetime-patterns}
-
-Every time you have strings that contain dates or datetime values that
-are outside of the ISO-8601 format, you usually have to use datetime
-patterns to convert these strings to date or datetime values. In other
-words, in this section I will describe how you can use datetime patterns
-to convert string values (that are outside of the ISO-8601 format) into
-date or datetime values.
-
-Despite the existence of an international standard (like ISO-8601), the
-date and datetime formats actually used vary across different countries
-around the world. There are tons of examples of these different formats.
-For example, in Brazil, dates are usually formatted like
-``Day/Month/Year''.
-Not only the order of the date components (day, month and year) are -different, but also, the separator character (``/'' instead of ``-''). - -In essence, a datetime pattern is a string pattern that describes a -specific date or datetime format. This means that we can use a datetime -pattern to describe any date or datetime format. A datetime pattern is a -string value that is constructed by grouping letters together. Each -individual letter represents an individual component of a date or a -datetime value. - -You can see at Table~\ref{tbl-date-components} a list of the most commom -used letters in datetime patterns. If you want, you can also see the -full list of possible letters in datetime patterns by visiting the Spark -Datetime Patterns page\footnote{\url{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}}. - -\begin{longtable}[]{@{} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.0789}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.3158}} - >{\raggedright\arraybackslash}p{(\columnwidth - 4\tabcolsep) * \real{0.6053}}@{}} -\caption{List of letters to represent date and datetime -components}\label{tbl-date-components}\tabularnewline -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Letter -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Meaning -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Example of a valid value -\end{minipage} \\ -\midrule\noalign{} -\endfirsthead -\toprule\noalign{} -\begin{minipage}[b]{\linewidth}\raggedright -Letter -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Meaning -\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright -Example of a valid value -\end{minipage} \\ -\midrule\noalign{} -\endhead -\bottomrule\noalign{} -\endlastfoot -G & era & AD; Anno Domini \\ -y & year & 2020; 20 \\ -D & day-of-year & 189 \\ -M/L & month-of-year & 7; 07; Jul; July \\ -d & day-of-month & 28 \\ -Q/q & quarter-of-year & 3; 03; Q3; 3rd quarter \\ -E & day-of-week & Tue; Tuesday \\ -F & aligned day of week & 3 \\ -a & am-pm-of-day & PM \\ -h & clock-hour-of-am-pm & 12 \\ -K & hour-of-am-pm & 0 \\ -k & clock-hour-of-day & 0 \\ -H & hour-of-day & 0 \\ -m & minute-of-hour & 30 \\ -s & second-of-minute & 55 \\ -S & fraction-of-second & 978 \\ -V & time-zone ID & America/Los\_Angeles; Z; -08:30 \\ -z & time-zone name & Pacific Standard Time; PST \\ -O & localized zone-offset & GMT+8; GMT+08:00; UTC-08:00; \\ -X & zone-offset `Z' for zero & Z; -08; -0830; -08:30; -083015; --08:30:15; \\ -x & zone-offset & +0000; -08; -0830; -08:30; -083015; -08:30:15; \\ -Z & zone-offset & +0000; -0800; -08:00; \\ -\end{longtable} - -\subsection{Using datetime patterns to get date -values}\label{using-datetime-patterns-to-get-date-values} - -Following Table~\ref{tbl-date-components}, if we had a date in the -format ``Day, Month of Year'', like ``12, November of 1997'', we would -use the letters d, M and y for each of the three components that are -present in this format. In fact, let's create a DataFrame with this -exact value, and let's demonstrate how could you convert it to a -\texttt{DateType} value. 
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{data }\OperatorTok{=}\NormalTok{ [ \{}\StringTok{"date"}\NormalTok{: }\StringTok{"12, November of 1997"}\NormalTok{\} ]} -\NormalTok{weird\_date }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data)} -\NormalTok{weird\_date.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+--------------------+ -| date| -+--------------------+ -|12, November of 1997| -+--------------------+ -\end{verbatim} - -To convert it from the \texttt{StringType} to \texttt{DateType} we have -to use the \texttt{to\_date()} Spark SQL function. Because with this -function, we can provide the datetime pattern that describes the exact -format that these dates use. But before we use \texttt{to\_date()}, we -need to build a datetime pattern to use. - -The date example above (``12, November of 1997'') starts with a -two-digit day number. That is why we begin our datetime pattern with two -d's, to represent this section of the date. After that we have a comma -and a space followed by the month name. However, both month name and the -year number at the end of the date are in their full formats, instead of -their abbreviated formats. - -Because of that, we need to tell Spark to use the full format instead of -the abbreviated format on both of these two components. To do that, we -use four M's and four y's, instead of just two. At last, we have a -literal ``of'' between the month name and the year name, and to describe -this specific section of the date, we insert -\texttt{\textquotesingle{}of\textquotesingle{}} between the M's and y's. - -In essence, we have the datetime pattern -\texttt{"dd,\ MMMM\ \textquotesingle{}of\textquotesingle{}\ yyyy"}. Now, -we can just use \texttt{to\_date()} with this datetime pattern to -convert the string value to a date value. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ to\_date} -\NormalTok{pattern }\OperatorTok{=} \StringTok{"dd, MMMM \textquotesingle{}of\textquotesingle{} yyyy"} -\NormalTok{weird\_date }\OperatorTok{=}\NormalTok{ weird\_date}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{"date"}\NormalTok{, to\_date(col(}\StringTok{"date"}\NormalTok{), pattern))} - -\NormalTok{weird\_date.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+ -| date| -+----------+ -|1997-11-12| -+----------+ -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{weird\_date.printSchema()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -root - |-- date: date (nullable = true) -\end{verbatim} - -So, if you have a constant (or fixed) text that is always present in all -of your date values, like the ``of'' in the above example, you must -encapsulate this text between quotation marks in your datetime patterns. -Also, depending if your components are in a abbreviated format (like -``02'' for year 2002, or ``Jan'' for the January month), or in a full -format (like the year ``1997'' or the month ``October''), you might want -to repeat the letters one or two times (to use abbreviated formats), or -you might want to repeat it four times (to use the full formats). In -other words, if you use less than 4 pattern letters, then, Spark will -use the short text form, typically an abbreviation form of the component -you are reffering to \emph{Apache Spark Official Documentation} (2022). - -As another example of unusual date formats, lets use the -\texttt{user\_events} DataFrame. 
You can easily get the data of this -DataFrame by dowloading the JSON file from the official repository of -this book\footnote{\url{https://github.com/pedropark99/Introd-pyspark/tree/main/Data}}. -After you downloaded the JSON file that contains the data, you can -import it to your Spark session with the commands below: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ StructType, StructField} -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ DoubleType, StringType} -\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ LongType, TimestampType, DateType} -\NormalTok{path }\OperatorTok{=} \StringTok{"../Data/user{-}events.json"} -\NormalTok{schema }\OperatorTok{=}\NormalTok{ StructType([} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}dateOfEvent\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}userId\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{False}\NormalTok{),} -\NormalTok{ StructField(}\StringTok{\textquotesingle{}nameOfEvent\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{False}\NormalTok{)} -\NormalTok{])} - -\NormalTok{user\_events }\OperatorTok{=}\NormalTok{ spark.read}\OperatorTok{\textbackslash{}} -\NormalTok{ .json(path, schema }\OperatorTok{=}\NormalTok{ schema)} - -\NormalTok{user\_events.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----------+-------------------+--------------------+ -|dateOfEvent| timeOfEvent| userId| -+-----------+-------------------+--------------------+ -| 15/06/2022|15/06/2022 14:33:10|b902e51e-d043-4a6...| -| 15/06/2022|15/06/2022 14:40:08|b902e51e-d043-4a6...| -| 15/06/2022|15/06/2022 15:48:41|b902e51e-d043-4a6...| -+-----------+-------------------+--------------------+ -... with 1 more columns: nameOfEvent -\end{verbatim} - -The \texttt{dateOfEvent} column of this DataFrame contains date values -in the Brazilian format ``Day/Month/Year''. To describe this date -format, we can use the datetime pattern \texttt{"dd/MM/yyyy"}. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{date\_pattern }\OperatorTok{=} \StringTok{"dd/MM/yyyy"} -\NormalTok{user\_events }\OperatorTok{=}\NormalTok{ user\_events}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}dateOfEvent\textquotesingle{}}\NormalTok{,} -\NormalTok{ to\_date(col(}\StringTok{\textquotesingle{}dateOfEvent\textquotesingle{}}\NormalTok{), date\_pattern)} -\NormalTok{ )} - -\NormalTok{user\_events.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-----------+-------------------+--------------------+ -|dateOfEvent| timeOfEvent| userId| -+-----------+-------------------+--------------------+ -| 2022-06-15|15/06/2022 14:33:10|b902e51e-d043-4a6...| -| 2022-06-15|15/06/2022 14:40:08|b902e51e-d043-4a6...| -| 2022-06-15|15/06/2022 15:48:41|b902e51e-d043-4a6...| -+-----------+-------------------+--------------------+ -... 
with 1 more columns: nameOfEvent
-\end{verbatim}
-
-\subsection{Using datetime patterns to get datetime/timestamp
-values}\label{using-datetime-patterns-to-get-datetimetimestamp-values}
-
-Furthermore, the \texttt{user\_events} table also contains the
-\texttt{timeOfEvent} column, which is a column of datetime (or
-timestamp) values, also in the usual Brazilian format ``Day/Month/Year
-Hour:Minutes:Seconds''. Following Table~\ref{tbl-date-components}, we
-can describe this datetime format with the pattern
-\texttt{"dd/MM/yyyy\ HH:mm:ss"}.
-
-When you have a column of string values that you want to convert into
-the \texttt{TimestampType} type, but your values are not in ISO-8601
-format, you should use the \texttt{to\_timestamp()} function (which also
-comes from the \texttt{pyspark.sql.functions} module) together with a
-datetime pattern to describe the actual format of your values.
-
-You use the \texttt{to\_timestamp()} function in the same way you would
-use the \texttt{to\_date()} function: you reference the name of the
-column you want to convert into a column of timestamp values, and you
-also provide a datetime pattern to be used in the conversion. The
-difference between the two functions is solely in the type of column
-they produce. The \texttt{to\_timestamp()} function always produces a
-column of type \texttt{TimestampType} as a result, while the
-\texttt{to\_date()} function produces a column of type
-\texttt{DateType}.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ to\_timestamp}
-\NormalTok{datetime\_pattern }\OperatorTok{=} \StringTok{"dd/MM/yyyy HH:mm:ss"}
-\NormalTok{user\_events }\OperatorTok{=}\NormalTok{ user\_events}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}
- \StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{,}
-\NormalTok{ to\_timestamp(col(}\StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{), datetime\_pattern)}
-\NormalTok{ )}
-
-\NormalTok{user\_events.show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-----------+-------------------+--------------------+
-|dateOfEvent| timeOfEvent| userId|
-+-----------+-------------------+--------------------+
-| 2022-06-15|2022-06-15 14:33:10|b902e51e-d043-4a6...|
-| 2022-06-15|2022-06-15 14:40:08|b902e51e-d043-4a6...|
-| 2022-06-15|2022-06-15 15:48:41|b902e51e-d043-4a6...|
-+-----------+-------------------+--------------------+
-... with 1 more columns: nameOfEvent
-\end{verbatim}
-
-\section{Extracting date or datetime
-components}\label{extracting-date-or-datetime-components}
-
-One very common operation when dealing with dates and datetime values is
-extracting components from these values. For example, you might want to
-create a new column that contains the day component of all your dates,
-or maybe the month component.
-
-Thankfully, \texttt{pyspark} makes it pretty easy to extract these kinds
-of components: you just use the function that corresponds to the
-component you want! All of these functions come from the
-\texttt{pyspark.sql.functions} module. As a quick list, the functions
-below are used to extract components from both dates and datetime
-values:
-
-\begin{itemize}
-\tightlist
-\item
-  \texttt{year()}: extract the year of a given date or datetime as
-  integer.
-\item
-  \texttt{month()}: extract the month of a given date or datetime as
-  integer.
-\item
-  \texttt{dayofmonth()}: extract the day of the month of a given date or
-  datetime as integer.
-\item
-  \texttt{dayofweek()}: extract the day of the week of a given date or
-  datetime as integer. The integer returned will be inside the range 1
-  (for a Sunday) through to 7 (for a Saturday).
-\item
-  \texttt{dayofyear()}: extract the day of the year of a given date or
-  datetime as integer.
-\item
-  \texttt{quarter()}: extract the quarter of a given date or datetime as
-  integer.
-\end{itemize}
-
-On the other hand, the functions listed below apply only to datetime
-values, because they extract time components (which are not present in
-date values). These functions also come from the
-\texttt{pyspark.sql.functions} module:
-
-\begin{itemize}
-\tightlist
-\item
-  \texttt{hour()}: extract the hours of a given datetime as integer.
-\item
-  \texttt{minute()}: extract the minutes of a given datetime as integer.
-\item
-  \texttt{second()}: extract the seconds of a given datetime as integer.
-\end{itemize}
-
-In essence, you just apply these functions to the column that contains
-your date and datetime values, and you get as output the component you
-want. As an example, let's go back to the \texttt{user\_events}
-DataFrame and extract the day, month and year components of each date
-into separate columns:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ (}
-\NormalTok{ dayofmonth,}
-\NormalTok{ month,}
-\NormalTok{ year}
-\NormalTok{)}
-
-\NormalTok{user\_events}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}dayOfEvent\textquotesingle{}}\NormalTok{, dayofmonth(col(}\StringTok{\textquotesingle{}dateOfEvent\textquotesingle{}}\NormalTok{)))}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}monthOfEvent\textquotesingle{}}\NormalTok{, month(col(}\StringTok{\textquotesingle{}dateOfEvent\textquotesingle{}}\NormalTok{)))}\OperatorTok{\textbackslash{}}
-\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}yearOfEvent\textquotesingle{}}\NormalTok{, year(col(}\StringTok{\textquotesingle{}dateOfEvent\textquotesingle{}}\NormalTok{)))}\OperatorTok{\textbackslash{}}
-\NormalTok{ .select(}
- \StringTok{\textquotesingle{}dateOfEvent\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}dayOfEvent\textquotesingle{}}\NormalTok{,}
- \StringTok{\textquotesingle{}monthOfEvent\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}yearOfEvent\textquotesingle{}}
-\NormalTok{ )}\OperatorTok{\textbackslash{}}
-\NormalTok{ .show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-----------+----------+------------+-----------+
-|dateOfEvent|dayOfEvent|monthOfEvent|yearOfEvent|
-+-----------+----------+------------+-----------+
-| 2022-06-15| 15| 6| 2022|
-| 2022-06-15| 15| 6| 2022|
-| 2022-06-15| 15| 6| 2022|
-+-----------+----------+------------+-----------+
-\end{verbatim}
-
-\section{Adding time to date and datetime values with interval
-expressions}\label{sec-interval-express}
-
-Another very common operation is to add some amount of time to a date
-or datetime value. For example, you might want to advance your dates by
-3 days, or delay your datetime values by 3 hours. In \texttt{pyspark},
-you can perform this kind of operation either by using functions or by
-using interval expressions.
-
-When we talk about functions available through the
-\texttt{pyspark.sql.functions} module, we have \texttt{date\_add()} and
-\texttt{date\_sub()}, which you can use to add days to, or subtract days
-from, a date, and \texttt{add\_months()}, which you can use to add
-months to a date.
Yes, the naming pattern of these functions is a little bit weird, -but let's ignore this fact. - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ (} -\NormalTok{ date\_add,} -\NormalTok{ date\_sub,} -\NormalTok{ add\_months} -\NormalTok{)} - -\NormalTok{user\_events}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}timePlus10Days\textquotesingle{}}\NormalTok{,} -\NormalTok{ date\_add(col(}\StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{), }\DecValTok{10}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}timeMinus5Days\textquotesingle{}}\NormalTok{,} -\NormalTok{ date\_sub(col(}\StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{), }\DecValTok{5}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}timePlus8Months\textquotesingle{}}\NormalTok{,} -\NormalTok{ add\_months(col(}\StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{), }\DecValTok{8}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}timePlus10Days\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}timeMinus5Days\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}timePlus8Months\textquotesingle{}} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------------------+--------------+--------------+ -| timeOfEvent|timePlus10Days|timeMinus5Days| -+-------------------+--------------+--------------+ -|2022-06-15 14:33:10| 2022-06-25| 2022-06-10| -|2022-06-15 14:40:08| 2022-06-25| 2022-06-10| -|2022-06-15 15:48:41| 2022-06-25| 2022-06-10| -+-------------------+--------------+--------------+ -... with 1 more columns: timePlus8Months -\end{verbatim} - -Now, you can use interval expressions to add whatever time unit you want -(years, months, days, hours, minutes or seconds) to either a date or -datetime value. However, interval expressions are only available at the -Spark SQL level. As a consequence, to use an interval expression, you -must use \texttt{expr()} or \texttt{spark.sql()} (that we introduced at -Chapter~\ref{sec-dataframe-sql-chapter}) to get access to Spark SQL. An -interval expression follows this pattern: - -\begin{Shaded} -\begin{Highlighting}[] -\DataTypeTok{INTERVAL} \DecValTok{3}\NormalTok{ HOURS} -\end{Highlighting} -\end{Shaded} - -An interval expression is an expression that begins with the -\texttt{INTERVAL} keyword, and then, it is followed by a number that -specifies an amount. You define how much exactly this amout represents -by using another keyword that specifies the unit of time to be used. So -the keyword \texttt{HOUR} specifies that you want to use the ``hour'' -unit of time. - -Having this in mind, the expression \texttt{INTERVAL\ 3\ HOURS} defines -a 3 hours interval expression. In contrast, the expression -\texttt{INTERVAL\ 3\ SECONDS} represents 3 seconds, and the expression -\texttt{INTERVAL\ 90\ MINUTES} represents 90 minutes, and so on. - -By having an interval expression, you can just add or subtract this -interval to your column to change the timestamp you have. 
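-
-The same interval arithmetic can also be written directly in Spark SQL
-through \texttt{spark.sql()}. The snippet below is a minimal sketch of
-that route, and it assumes that you have registered \texttt{user\_events}
-as a temporary view; the \texttt{expr()} route is demonstrated right
-after:
-
-\begin{verbatim}
-# A minimal sketch: the same interval expressions, written
-# entirely in Spark SQL through spark.sql()
-user_events.createOrReplaceTempView('user_events')
-
-spark.sql('''
-    SELECT
-        timeOfEvent,
-        timeOfEvent + INTERVAL 3 HOURS AS timePlus3Hours,
-        timeOfEvent - INTERVAL 30 MINUTES AS timeMinus30Minutes
-    FROM user_events
-''').show()
-\end{verbatim}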
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ expr}
-\NormalTok{user\_events}\OperatorTok{\textbackslash{}}
-\NormalTok{    .withColumn(}
-        \StringTok{\textquotesingle{}timePlus3Hours\textquotesingle{}}\NormalTok{,}
-\NormalTok{        expr(}\StringTok{\textquotesingle{}timeOfEvent + INTERVAL 3 HOURS\textquotesingle{}}\NormalTok{)}
-\NormalTok{    )}\OperatorTok{\textbackslash{}}
-\NormalTok{    .withColumn(}
-        \StringTok{\textquotesingle{}timePlus2Years\textquotesingle{}}\NormalTok{,}
-\NormalTok{        expr(}\StringTok{\textquotesingle{}timeOfEvent + INTERVAL 2 YEARS\textquotesingle{}}\NormalTok{)}
-\NormalTok{    )}\OperatorTok{\textbackslash{}}
-\NormalTok{    .select(}\StringTok{\textquotesingle{}timeOfEvent\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}timePlus3Hours\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}timePlus2Years\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+-------------------+-------------------+-------------------+
-|        timeOfEvent|     timePlus3Hours|     timePlus2Years|
-+-------------------+-------------------+-------------------+
-|2022-06-15 14:33:10|2022-06-15 17:33:10|2024-06-15 14:33:10|
-|2022-06-15 14:40:08|2022-06-15 17:40:08|2024-06-15 14:40:08|
-|2022-06-15 15:48:41|2022-06-15 18:48:41|2024-06-15 15:48:41|
-+-------------------+-------------------+-------------------+
-\end{verbatim}
-
-\section{Calculating differences between dates and datetime
-values}\label{calculating-differences-between-dates-and-datetime-values}
-
-Another very common operation is to calculate how much time lies between
-two dates or two datetimes. In other words, you might want to calculate
-how many days the date \texttt{"2023-05-12"} is ahead of
-\texttt{"2023-05-05"}, or how many hours the timestamp
-\texttt{"2023-07-10\ 13:13:00"} is ahead of
-\texttt{"2023-07-10\ 05:18:00"}.
-
-When you have two columns of type \texttt{DateType} (or
-\texttt{TimestampType}), you can subtract one from the other directly to
-get the difference between the values of these columns. 
As an example, lets -use the \texttt{df4} below: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ random }\ImportTok{import}\NormalTok{ randint, seed} -\ImportTok{from}\NormalTok{ datetime }\ImportTok{import}\NormalTok{ (} -\NormalTok{ date,} -\NormalTok{ datetime,} -\NormalTok{ timedelta} -\NormalTok{)} - -\NormalTok{seed(}\DecValTok{10}\NormalTok{)} -\NormalTok{dates }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ date(}\DecValTok{2023}\NormalTok{,}\DecValTok{4}\NormalTok{,}\DecValTok{1}\NormalTok{) }\OperatorTok{+}\NormalTok{ timedelta(days }\OperatorTok{=}\NormalTok{ randint(}\DecValTok{1}\NormalTok{,}\DecValTok{39}\NormalTok{))} - \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{4}\NormalTok{)} -\NormalTok{]} - -\NormalTok{datetimes }\OperatorTok{=}\NormalTok{ [} -\NormalTok{ datetime(}\DecValTok{2023}\NormalTok{,}\DecValTok{4}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{8}\NormalTok{,}\DecValTok{14}\NormalTok{,}\DecValTok{54}\NormalTok{) }\OperatorTok{+}\NormalTok{ timedelta(hours }\OperatorTok{=}\NormalTok{ randint(}\DecValTok{1}\NormalTok{,}\DecValTok{39}\NormalTok{))} - \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{4}\NormalTok{)} -\NormalTok{]} - - -\NormalTok{df4 }\OperatorTok{=}\NormalTok{ spark.createDataFrame([} -\NormalTok{ Row(} -\NormalTok{ date1 }\OperatorTok{=}\NormalTok{ date(}\DecValTok{2023}\NormalTok{,}\DecValTok{4}\NormalTok{,}\DecValTok{1}\NormalTok{),} -\NormalTok{ date2 }\OperatorTok{=}\NormalTok{ dates[i],} -\NormalTok{ datetime1 }\OperatorTok{=}\NormalTok{ datetime(}\DecValTok{2023}\NormalTok{,}\DecValTok{4}\NormalTok{,}\DecValTok{1}\NormalTok{,}\DecValTok{8}\NormalTok{,}\DecValTok{14}\NormalTok{,}\DecValTok{54}\NormalTok{),} -\NormalTok{ datetime2 }\OperatorTok{=}\NormalTok{ datetimes[i]} -\NormalTok{ )} - \ControlFlowTok{for}\NormalTok{ i }\KeywordTok{in} \BuiltInTok{range}\NormalTok{(}\DecValTok{4}\NormalTok{)} -\NormalTok{])} - -\NormalTok{df4.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+----------+-------------------+-------------------+ -| date1| date2| datetime1| datetime2| -+----------+----------+-------------------+-------------------+ -|2023-04-01|2023-05-08|2023-04-01 08:14:54|2023-04-02 21:14:54| -|2023-04-01|2023-04-04|2023-04-01 08:14:54|2023-04-01 09:14:54| -|2023-04-01|2023-04-29|2023-04-01 08:14:54|2023-04-01 22:14:54| -|2023-04-01|2023-05-02|2023-04-01 08:14:54|2023-04-02 14:14:54| -+----------+----------+-------------------+-------------------+ -\end{verbatim} - -If we subtract columns \texttt{date2} by \texttt{date1} and -\texttt{datetime2} by \texttt{datetime1}, we get two new columns of type -\texttt{DayTimeIntervalType}. These new columns represent the interval -of time between the values of these columns. 
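-
-As a side note, when all you need is the difference expressed in whole
-days, the \texttt{datediff()} function from
-\texttt{pyspark.sql.functions} returns that number directly as an
-integer. Below is a minimal sketch of this shortcut over the same
-\texttt{df4} (the \texttt{ndays} column name is just for illustration);
-the interval-typed subtraction itself is demonstrated in the examples
-that follow:
-
-\begin{verbatim}
-# A minimal sketch: datediff(end, start) returns the number of
-# whole days between two date (or datetime) columns as an integer
-from pyspark.sql.functions import datediff, col
-
-df4\
-    .withColumn('ndays', datediff(col('date2'), col('date1')))\
-    .select('date1', 'date2', 'ndays')\
-    .show()
-\end{verbatim}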
- -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df4}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}date1\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}date2\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}datediff\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}date2\textquotesingle{}}\NormalTok{) }\OperatorTok{{-}}\NormalTok{ col(}\StringTok{\textquotesingle{}date1\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+----------+-----------------+ -| date1| date2| datediff| -+----------+----------+-----------------+ -|2023-04-01|2023-05-08|INTERVAL '37' DAY| -|2023-04-01|2023-04-04| INTERVAL '3' DAY| -|2023-04-01|2023-04-29|INTERVAL '28' DAY| -|2023-04-01|2023-05-02|INTERVAL '31' DAY| -+----------+----------+-----------------+ -\end{verbatim} - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df4}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}datetime1\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}datetime2\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}datetimediff\textquotesingle{}}\NormalTok{,} -\NormalTok{ col(}\StringTok{\textquotesingle{}datetime2\textquotesingle{}}\NormalTok{) }\OperatorTok{{-}}\NormalTok{ col(}\StringTok{\textquotesingle{}datetime1\textquotesingle{}}\NormalTok{)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------------------+-------------------+--------------------+ -| datetime1| datetime2| datetimediff| -+-------------------+-------------------+--------------------+ -|2023-04-01 08:14:54|2023-04-02 21:14:54|INTERVAL '1 13:00...| -|2023-04-01 08:14:54|2023-04-01 09:14:54|INTERVAL '0 01:00...| -|2023-04-01 08:14:54|2023-04-01 22:14:54|INTERVAL '0 14:00...| -|2023-04-01 08:14:54|2023-04-02 14:14:54|INTERVAL '1 06:00...| -+-------------------+-------------------+--------------------+ -\end{verbatim} - -But another very commom way to calculate this difference is to convert -the datetime values to seconds. Then, you subtract the calculated -seconds, so you get as output, the difference in seconds between the two -datetime values. - -To use this method, you first use the \texttt{unix\_timestamp()} -function to convert the timestamps into seconds as integer values, then, -you subtract these new integer values that were created. 
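-
-If you prefer to avoid \texttt{unix\_timestamp()}, casting a timestamp
-column to the \texttt{long} type produces the same number of seconds
-since the Unix epoch. The snippet below is a minimal sketch of this
-equivalent route (the \texttt{seconds1} and \texttt{seconds2} column
-names are just for illustration); the \texttt{unix\_timestamp()} version
-appears in the example that follows:
-
-\begin{verbatim}
-# A minimal sketch: casting TimestampType to 'long' also yields
-# seconds since the Unix epoch
-df4\
-    .withColumn('seconds1', col('datetime1').cast('long'))\
-    .withColumn('seconds2', col('datetime2').cast('long'))\
-    .withColumn('diffinseconds', col('seconds2') - col('seconds1'))\
-    .select('datetime1', 'datetime2', 'diffinseconds')\
-    .show()
-\end{verbatim}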
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ unix\_timestamp} -\NormalTok{df4 }\OperatorTok{=}\NormalTok{ df4}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}datetime1\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}datetime2\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}datetime1\textquotesingle{}}\NormalTok{, unix\_timestamp(col(}\StringTok{\textquotesingle{}datetime1\textquotesingle{}}\NormalTok{)))}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}datetime2\textquotesingle{}}\NormalTok{, unix\_timestamp(col(}\StringTok{\textquotesingle{}datetime2\textquotesingle{}}\NormalTok{)))}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}diffinseconds\textquotesingle{}}\NormalTok{,} -\NormalTok{ col(}\StringTok{\textquotesingle{}datetime2\textquotesingle{}}\NormalTok{) }\OperatorTok{{-}}\NormalTok{ col(}\StringTok{\textquotesingle{}datetime1\textquotesingle{}}\NormalTok{)} -\NormalTok{ )} - -\NormalTok{df4.show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+----------+-------------+ -| datetime1| datetime2|diffinseconds| -+----------+----------+-------------+ -|1680347694|1680480894| 133200| -|1680347694|1680351294| 3600| -|1680347694|1680398094| 50400| -|1680347694|1680455694| 108000| -+----------+----------+-------------+ -\end{verbatim} - -Now that we have the \texttt{diffinseconds} column, which represents the -difference \textbf{in seconds} between \texttt{datetime1} and -\texttt{datetime2} columns, we can divide this \texttt{diffinseconds} -column by 60 to get the difference in minutes, or divide by 3600 to get -the difference in hours, etc. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{df4}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}diffinminutes\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}diffinseconds\textquotesingle{}}\NormalTok{) }\OperatorTok{/} \DecValTok{60}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}diffinhours\textquotesingle{}}\NormalTok{, col(}\StringTok{\textquotesingle{}diffinseconds\textquotesingle{}}\NormalTok{) }\OperatorTok{/} \DecValTok{3600}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .show()} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+----------+----------+-------------+-------------+-----------+ -| datetime1| datetime2|diffinseconds|diffinminutes|diffinhours| -+----------+----------+-------------+-------------+-----------+ -|1680347694|1680480894| 133200| 2220.0| 37.0| -|1680347694|1680351294| 3600| 60.0| 1.0| -|1680347694|1680398094| 50400| 840.0| 14.0| -|1680347694|1680455694| 108000| 1800.0| 30.0| -+----------+----------+-------------+-------------+-----------+ -\end{verbatim} - -\section{Getting the now and today -values}\label{getting-the-now-and-today-values} - -If for some reason, you need to know which timestamp is now, or which -date is today, you can use the \texttt{current\_timestamp()} and -\texttt{current\_date()} functions from \texttt{pyspark.sql.functions} -module. When you execute these functions, they output the current -timestamp and date in your system. 
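-
-If you just want to peek at these values, you do not even need to build
-a DataFrame with a predefined schema. The snippet below is a minimal
-sketch that selects both functions over a one-row DataFrame created
-with \texttt{spark.range(1)}; the more explicit, schema-based
-construction is shown in the example that follows:
-
-\begin{verbatim}
-# A minimal sketch: a one-row DataFrame is enough to inspect
-# the current date and timestamp
-from pyspark.sql.functions import current_date, current_timestamp
-
-spark.range(1)\
-    .select(
-        current_date().alias('today'),
-        current_timestamp().alias('now')
-    )\
-    .show(truncate=False)
-\end{verbatim}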
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.types }\ImportTok{import}\NormalTok{ StructType, StructField, StringType}
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ (}
-\NormalTok{    current\_date,}
-\NormalTok{    current\_timestamp,}
-\NormalTok{    lit}
-\NormalTok{)}
-
-\NormalTok{data }\OperatorTok{=}\NormalTok{ [Row(today}\OperatorTok{=}\StringTok{""}\NormalTok{, now}\OperatorTok{=}\StringTok{""}\NormalTok{)]}
-\NormalTok{schema }\OperatorTok{=}\NormalTok{ StructType([}
-\NormalTok{    StructField(}\StringTok{\textquotesingle{}today\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{True}\NormalTok{),}
-\NormalTok{    StructField(}\StringTok{\textquotesingle{}now\textquotesingle{}}\NormalTok{, StringType(), }\VariableTok{True}\NormalTok{)}
-\NormalTok{])}
-
-\NormalTok{spark.createDataFrame(data, schema }\OperatorTok{=}\NormalTok{ schema)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .withColumn(}\StringTok{\textquotesingle{}today\textquotesingle{}}\NormalTok{, lit(current\_date()))}\OperatorTok{\textbackslash{}}
-\NormalTok{    .withColumn(}\StringTok{\textquotesingle{}now\textquotesingle{}}\NormalTok{, lit(current\_timestamp()))}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+----------+--------------------+
-|     today|                 now|
-+----------+--------------------+
-|2024-01-09|2024-01-09 11:37:...|
-+----------+--------------------+
-\end{verbatim}
-
-\bookmarksetup{startatroot}
-
-\chapter{Introducing window functions}\label{sec-window-functions}
-
-Spark offers a set of tools known as \emph{window functions}. These
-tools are essential for an extensive range of tasks, and you should know
-them. But what are they?
-
-Window functions in Spark are a set of functions that perform
-calculations over windows of rows from your DataFrame. This is not a
-concept exclusive to Spark. In fact, window functions in Spark are
-essentially the same
-\href{https://dev.mysql.com/doc/refman/8.0/en/window-functions-usage.html}{as
-window functions in MySQL}\footnote{\url{https://dev.mysql.com/doc/refman/8.0/en/window-functions-usage.html}}.
-
-When you use a window function, the rows of your DataFrame are divided
-into multiple windows. Each window contains a specific range of rows
-from the DataFrame. In this context, a window function is a function
-that receives a window (or a range of rows) as input, and calculates an
-aggregate or a specific index based on the set of rows that is contained
-in this input window.
-
-You might find this description very similar to what the \texttt{groupby()}
-and \texttt{agg()} methods do when combined together. And yes\ldots{} To
-some extent, the idea of windows in a DataFrame is similar (but not
-identical) to the idea of ``groups'' created by \emph{group by}
-functions, such as the \texttt{DataFrame.groupby()} method from
-\texttt{pyspark} (that we presented at Section~\ref{sec-group-by}), or
-the
-\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html}{\texttt{DataFrame.groupby()}}\footnote{\url{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html}}
-method from \texttt{pandas}, and also to
-\href{https://dplyr.tidyverse.org/reference/group_by.html}{\texttt{dplyr::group\_by()}}\footnote{\url{https://dplyr.tidyverse.org/reference/group_by.html}}
-from the \texttt{tidyverse} framework. You will see further in this
-chapter how window functions differ from these operations.
- -\section{How to define windows}\label{sec-window-def} - -In order to use a window function you need to define the windows of your -DataFrame first. You do this by creating a \texttt{Window} object in -your session. - -Every window object have two components, which are partitioning and -ordering, and you specify each of these components by using the -\texttt{partitionBy()} and \texttt{orderBy()} methods from the -\texttt{Window} class. In order to create a \texttt{Window} object, you -need to import the \texttt{Window} class from the -\texttt{pyspark.sql.window} module: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.window }\ImportTok{import}\NormalTok{ Window} -\end{Highlighting} -\end{Shaded} - -Over the next examples, I will be using the \texttt{transf} DataFrame -that we presented at Chapter~\ref{sec-transforming-dataframes-part1}. If -you don't remember how to import/get this DataFrame into your session, -come back to Section~\ref{sec-transf-dataframe}. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf.show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+-------------------+------------+-------------+ -|dateTransfer| datetimeTransfer|clientNumber|transferValue| -+------------+-------------------+------------+-------------+ -| 2022-12-31|2022-12-31 14:00:24| 5516| 7794.31| -| 2022-12-31|2022-12-31 10:32:07| 4965| 7919.0| -| 2022-12-31|2022-12-31 07:37:02| 4608| 5603.0| -| 2022-12-31|2022-12-31 07:35:05| 1121| 4365.22| -| 2022-12-31|2022-12-31 02:53:44| 1121| 4620.0| -+------------+-------------------+------------+-------------+ -only showing top 5 rows -... with 6 more columns: transferCurrency, transferID, trans -ferLog, destinationBankNumber, destinationBankBranch, destin -ationBankAccount -\end{verbatim} - -Now, lets create a window object using the \texttt{transf} DataFrame as -our target. This DataFrame describes a set of transfers made in a -fictitious bank. So a reasonable way of splitting this DataFrame is by -day. That means that we can split this DataFrame into groups (or ranges) -of rows by using the \texttt{dateTransfer} column. As a result, each -partition in the \texttt{dateTransfer} column will create/identify a -different window in this DataFrame. - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window.partitionBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -The above window object specifies that each unique value present in the -\texttt{dateTransfer} column identifies a different window frame in the -\texttt{transf} DataFrame. Figure~\ref{fig-window-spec1} presents this -idea visually. So each partition in the \texttt{dateTransfer} column -creates a different window frame. And each window frame will become an -input to a window function (when we use one). - -\begin{figure} - -\centering{ - -\includegraphics[width=1\textwidth,height=\textheight]{Chapters/./../Figures/window-spec1.png} - -} - -\caption{\label{fig-window-spec1}Visualizing the window frames - Part 1} - -\end{figure}% - -Until this point, defining windows are very much like defining groups in -your DataFrame with \emph{group by} functions (i.e.~windows are very -similar to groups). But in the above example, we specified only the -partition component of the windows. The partitioning component of the -window object specifies which partitions of the DataFrame are translated -into windows. 
On the other hand, the ordering component of the window
-object specifies how the rows within the window are ordered.
-
-Defining the ordering component becomes very important when we are
-working with window functions that output (or that use) indexes. As an
-example, you might want to use in your calculations the first (or the
-\emph{nth}) row in each window. In a situation like this, the order in
-which these rows are found inside the window directly affects the
-output of your window function. That is why the ordering component
-matters.
-
-For example, we can say that the rows within each window should be in
-descending order according to the \texttt{datetimeTransfer} column:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ col}
-\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window}\OperatorTok{\textbackslash{}}
-\NormalTok{    .partitionBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .orderBy(col(}\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{).desc())}
-\end{Highlighting}
-\end{Shaded}
-
-With the above snippet, we are not only specifying how the window frames
-in the DataFrame are created (with the \texttt{partitionBy()}), but we
-are also specifying how the rows within the window are sorted (with the
-\texttt{orderBy()}). If we update our representation with the above
-window specification, we get something similar to
-Figure~\ref{fig-window-spec2}:
-
-\begin{figure}
-
-\centering{
-
-\includegraphics[width=1\textwidth,height=\textheight]{Chapters/./../Figures/window-spec2.png}
-
-}
-
-\caption{\label{fig-window-spec2}Visualizing the window frames - Part 2}
-
-\end{figure}%
-
-It is worth mentioning that both the \texttt{partitionBy()} and
-\texttt{orderBy()} methods accept multiple columns as input. In other
-words, you can use a combination of columns both to define how the
-windows in your DataFrame will be created, and how the rows within these
-windows will be sorted.
-
-As an example, the window specification below is saying: 1) that a
-window frame is created for each unique combination of
-\texttt{dateTransfer} and \texttt{clientNumber}; 2) that the rows within
-each window are ordered according to \texttt{transferCurrency}
-(ascending order) and \texttt{datetimeTransfer} (descending order).
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window}\OperatorTok{\textbackslash{}}
-\NormalTok{    .partitionBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}clientNumber\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .orderBy(}
-\NormalTok{        col(}\StringTok{\textquotesingle{}transferCurrency\textquotesingle{}}\NormalTok{).asc(),}
-\NormalTok{        col(}\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{).desc()}
-\NormalTok{    )}
-\end{Highlighting}
-\end{Shaded}
-
-\subsection{Partitioning or ordering or
-none}\label{partitioning-or-ordering-or-none}
-
-It is worth mentioning that both the partitioning and ordering components
-of the window specification \textbf{are optional}. You can create a
-window object that has only a partitioning component defined, or only an
-ordering component, or, in fact, a window object that has basically
-neither of them defined.
-
-As an example, all three objects below (\texttt{w1}, \texttt{w2} and
-\texttt{w3}) are valid window objects. \texttt{w1} has only the
-partition component defined, while \texttt{w2} has only the ordering
-component defined. However, \texttt{w3} has basically neither of them
-defined, because \texttt{w3} is partitioned by nothing. In a situation
-like this, a single window is created, and this window covers the entire
-DataFrame. It covers all the rows at once. It is like you were not using
-any window at all.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{w1 }\OperatorTok{=}\NormalTok{ Window.partitionBy(}\StringTok{\textquotesingle{}x\textquotesingle{}}\NormalTok{)}
-\NormalTok{w2 }\OperatorTok{=}\NormalTok{ Window.orderBy(}\StringTok{\textquotesingle{}x\textquotesingle{}}\NormalTok{)}
-\NormalTok{w3 }\OperatorTok{=}\NormalTok{ Window.partitionBy()}
-\end{Highlighting}
-\end{Shaded}
-
-So just be aware of this. Be aware that you can cover the entire
-DataFrame with a single window. Be aware that if you use a window object
-with neither component defined (\texttt{Window.partitionBy()}), your
-window function basically works with the entire DataFrame at once. In
-essence, this window function becomes similar to a normal aggregating
-function.
-
-\section{\texorpdfstring{Introducing the \texttt{over()}
-clause}{Introducing the over() clause}}\label{introducing-the-over-clause}
-
-In order to use a window function you \textbf{need to combine an over
-clause with a window object}. If you pair these two components together,
-then the function you are using becomes a window function.
-
-Now that we know how to define window objects for our DataFrame, we can
-actually create and use such an object to access window functionality,
-by pairing it with an \texttt{over()} clause.
-
-In \texttt{pyspark} this \texttt{over()} clause is actually a method
-from the \texttt{Column} class. Since all aggregating functions
-available from the \texttt{pyspark.sql.functions} module produce a new
-\texttt{Column} object as output, we tend to use the \texttt{over()}
-method right after the function call.
-
-For example, if we wanted to calculate the mean of \texttt{x} with the
-\texttt{mean()} function, and we had a window object called
-\texttt{window\_spec}, we could use \texttt{mean()} as a window
-function by writing
-\texttt{mean(col(\textquotesingle{}x\textquotesingle{})).over(window\_spec)}.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.window }\ImportTok{import}\NormalTok{ Window}
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ mean, col}
-\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window}\OperatorTok{\textbackslash{}}
-\NormalTok{    .partitionBy(}\StringTok{\textquotesingle{}y\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}z\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .orderBy(}\StringTok{\textquotesingle{}t\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{mean(col(}\StringTok{\textquotesingle{}x\textquotesingle{}}\NormalTok{)).over(window\_spec)}
-\end{Highlighting}
-\end{Shaded}
-
-If you see this \texttt{over()} method after a call to an aggregating
-function (such as \texttt{sum()}, \texttt{mean()}, etc.), then you know
-that this aggregating function is being called as a window function.
-
-The \texttt{over()} clause is also available in Spark SQL as the SQL
-keyword \texttt{OVER}. This means that you can use window functions in
-Spark SQL as well. 
But in Spark SQL, you write the window specification -inside parentheses after the \texttt{OVER} keyword, and you specify each -component with \texttt{PARTITION\ BY} AND \texttt{ORDER\ BY} keywords. -We could replicate the above example in Spark SQL like this: - -\begin{Shaded} -\begin{Highlighting}[] -\KeywordTok{SELECT}\NormalTok{ mean(x) }\KeywordTok{OVER}\NormalTok{ (}\KeywordTok{PARTITION} \KeywordTok{BY}\NormalTok{ y, z }\KeywordTok{ORDER} \KeywordTok{BY}\NormalTok{ t }\KeywordTok{ASC}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\section{\texorpdfstring{Window functions vs \emph{group by} -functions}{Window functions vs group by functions}}\label{window-functions-vs-group-by-functions} - -Despite their similarities, window functions and \emph{group by} -functions are used for different purposes. One big difference between -them, is that when you use \texttt{groupby()} + \texttt{agg()} you get -one output row per each input group of rows, but in contrast, a window -function outputs one row per input row. In other words, for a window of -\(n\) input rows a window function outputs \(n\) rows that contains the -same result (or the same aggregate result). - -For example, lets suppose you want to calculate the total value -transfered within each day. If you use a \texttt{groupby()} + -\texttt{agg()} strategy, you get as result a new DataFrame containing -one row for each unique date present in the \texttt{dateTransfer} -column: - -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import} \BuiltInTok{sum} -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .agg(}\BuiltInTok{sum}\NormalTok{(col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)).alias(}\StringTok{\textquotesingle{}dayTotalTransferValue\textquotesingle{}}\NormalTok{))}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+---------------------+ -|dateTransfer|dayTotalTransferValue| -+------------+---------------------+ -| 2022-01-01| 39630.7| -| 2022-01-02| 70031.46| -| 2022-01-03| 50957.869999999995| -| 2022-01-04| 56068.34| -| 2022-01-05| 47082.04| -+------------+---------------------+ -only showing top 5 rows -\end{verbatim} - -On the other site, if you use \texttt{sum()} as a window function -instead, you get as result one row for each transfer. That is, you get -one row of output for each input row in the \texttt{transf} DataFrame. -The value that is present in the new column created -(\texttt{dayTotalTransferValue}) is the total value transfered for the -window (or the range of rows) that corresponds to the date in the -\texttt{dateTransfer} column. 
- -In other words, the value \texttt{39630.7} below corresponds to the sum -of the \texttt{transferValue} column when -\texttt{dateTransfer\ ==\ "2022-01-01"}: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window.partitionBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)} -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}transferID\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(} - \StringTok{\textquotesingle{}dayTotalTransferValue\textquotesingle{}}\NormalTok{,} - \BuiltInTok{sum}\NormalTok{(col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)).over(window\_spec)} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+------------+----------+-------------+---------------------+ -|dateTransfer|transferID|transferValue|dayTotalTransferValue| -+------------+----------+-------------+---------------------+ -| 2022-01-01| 20221148| 5547.13| 39630.7| -| 2022-01-01| 20221147| 9941.0| 39630.7| -| 2022-01-01| 20221146| 5419.9| 39630.7| -| 2022-01-01| 20221145| 5006.0| 39630.7| -| 2022-01-01| 20221144| 8640.06| 39630.7| -+------------+----------+-------------+---------------------+ -only showing top 5 rows -\end{verbatim} - -You probably already seen this pattern in other data frameworks. As a -quick comparison, if you were using the \texttt{tidyverse} framework, -you could calculate the exact same result above with the following -snippet of R code: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf }\SpecialCharTok{|\textgreater{}} - \FunctionTok{group\_by}\NormalTok{(dateTransfer) }\SpecialCharTok{|\textgreater{}} - \FunctionTok{mutate}\NormalTok{(} - \AttributeTok{dayTotalTransferValue =} \FunctionTok{sum}\NormalTok{(transferValue)} -\NormalTok{ )} -\end{Highlighting} -\end{Shaded} - -In contrast, you would need the following snippet of Python code to get -the same result in the \texttt{pandas} framework: - -\begin{Shaded} -\begin{Highlighting}[] -\NormalTok{transf[}\StringTok{\textquotesingle{}dayTotalTransferValue\textquotesingle{}}\NormalTok{] }\OperatorTok{=}\NormalTok{ transf[}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{]}\OperatorTok{\textbackslash{}} -\NormalTok{ .groupby(transf[}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{])}\OperatorTok{\textbackslash{}} -\NormalTok{ .transform(}\StringTok{\textquotesingle{}sum\textquotesingle{}}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\section{Ranking window functions}\label{ranking-window-functions} - -The functions \texttt{row\_number()}, \texttt{rank()} and -\texttt{dense\_rank()} from the \texttt{pyspark.sql.functions} module -are ranking functions, in the sense that they seek to rank each row in -the input window according to a ranking system. These functions are -identical to their -\href{https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html\#function_row-number}{peers -in MySQL}\footnote{\url{https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html\#function_row-number}} -\texttt{ROW\_NUMBER()}, \texttt{RANK()} and \texttt{DENSE\_RANK()}. 
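-
-Because of this equivalence, you can also write these rankings with SQL
-syntax through the \texttt{expr()} function and get the same results.
-The snippet below is a minimal sketch of that SQL route over the
-\texttt{transf} DataFrame; the Python API versions of
-\texttt{row\_number()}, \texttt{rank()} and \texttt{dense\_rank()} are
-demonstrated over the rest of this section:
-
-\begin{verbatim}
-# A minimal sketch: the same ranking written in SQL syntax via expr()
-from pyspark.sql.functions import expr
-
-row_number_sql = expr(
-    'row_number() over (partition by dateTransfer order by datetimeTransfer)'
-)
-
-transf\
-    .withColumn('rowID', row_number_sql)\
-    .select('dateTransfer', 'datetimeTransfer', 'transferID', 'rowID')\
-    .show(5)
-\end{verbatim}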
-
-The function \texttt{row\_number()} simply assigns a unique and
-sequential number to each row in a window, starting from 1. It is a
-quick way of marking each row with a unique and sequential number.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ row\_number}
-\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window}\OperatorTok{\textbackslash{}}
-\NormalTok{    .partitionBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .orderBy(}\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{    .select(}
-        \StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{,}
-        \StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{,}
-        \StringTok{\textquotesingle{}transferID\textquotesingle{}}
-\NormalTok{    )}\OperatorTok{\textbackslash{}}
-\NormalTok{    .withColumn(}\StringTok{\textquotesingle{}rowID\textquotesingle{}}\NormalTok{, row\_number().over(window\_spec))}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------+-------------------+----------+-----+
-|dateTransfer|   datetimeTransfer|transferID|rowID|
-+------------+-------------------+----------+-----+
-|  2022-01-01|2022-01-01 03:56:58|  20221143|    1|
-|  2022-01-01|2022-01-01 04:07:44|  20221144|    2|
-|  2022-01-01|2022-01-01 09:00:18|  20221145|    3|
-|  2022-01-01|2022-01-01 10:17:04|  20221146|    4|
-|  2022-01-01|2022-01-01 16:14:30|  20221147|    5|
-+------------+-------------------+----------+-----+
-only showing top 5 rows
-\end{verbatim}
-
-The \texttt{row\_number()} function is also very useful when you are
-trying to collect the rows in each window that contain the smallest or
-biggest value in the window. If the ordering of your window
-specification is in ascending order, then the first row in the window
-will contain the smallest value in the current window. In contrast, if
-the ordering is in descending order, then the first row in the window
-will contain the biggest value in the current window.
-
-This is interesting. Suppose, for example, that you wanted to find the
-rows that contain the maximum transfer value of each day. A
-\texttt{groupby()} + \texttt{agg()} strategy would tell you which are
-the maximum transfer values in each day. But it would not tell you where
-in the DataFrame the rows that contain these maximum values are. A
-\texttt{Window} object + \texttt{row\_number()} + \texttt{filter()} can
-help you get this answer.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window}\OperatorTok{\textbackslash{}}
-\NormalTok{    .partitionBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .orderBy(col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).desc())}
-
-\CommentTok{\# The row with rowID == 1 is the first row in each window}
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{    .withColumn(}\StringTok{\textquotesingle{}rowID\textquotesingle{}}\NormalTok{, row\_number().over(window\_spec))}\OperatorTok{\textbackslash{}}
-\NormalTok{    .}\BuiltInTok{filter}\NormalTok{(col(}\StringTok{\textquotesingle{}rowID\textquotesingle{}}\NormalTok{) }\OperatorTok{==} \DecValTok{1}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .select(}
-        \StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}rowID\textquotesingle{}}\NormalTok{,}
-        \StringTok{\textquotesingle{}transferID\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}transferValue\textquotesingle{}}
-\NormalTok{    )}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------+-----+----------+-------------+
-|dateTransfer|rowID|transferID|transferValue|
-+------------+-----+----------+-------------+
-|  2022-01-01|    1|  20221147|       9941.0|
-|  2022-01-02|    1|  20221157|     10855.01|
-|  2022-01-03|    1|  20221165|      8705.65|
-|  2022-01-04|    1|  20221172|       9051.0|
-|  2022-01-05|    1|  20221179|       9606.0|
-+------------+-----+----------+-------------+
-only showing top 5 rows
-\end{verbatim}
-
-The \texttt{rank()} and \texttt{dense\_rank()} functions are similar to
-each other. They both rank the rows with integers, just like
-\texttt{row\_number()}. But if there is a tie between two rows (that is,
-both rows have the same value in the ordering column, so we do not know
-which one of them should come first), then these functions will repeat
-the same number/index for the tied rows. Let's use the \texttt{df} below
-as a quick example:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{data }\OperatorTok{=}\NormalTok{ [}
-\NormalTok{    (}\DecValTok{1}\NormalTok{, }\DecValTok{3000}\NormalTok{), (}\DecValTok{1}\NormalTok{, }\DecValTok{2400}\NormalTok{),}
-\NormalTok{    (}\DecValTok{1}\NormalTok{, }\DecValTok{4200}\NormalTok{), (}\DecValTok{1}\NormalTok{, }\DecValTok{4200}\NormalTok{),}
-\NormalTok{    (}\DecValTok{2}\NormalTok{, }\DecValTok{1500}\NormalTok{), (}\DecValTok{2}\NormalTok{, }\DecValTok{2000}\NormalTok{),}
-\NormalTok{    (}\DecValTok{2}\NormalTok{, }\DecValTok{3000}\NormalTok{), (}\DecValTok{2}\NormalTok{, }\DecValTok{3000}\NormalTok{),}
-\NormalTok{    (}\DecValTok{2}\NormalTok{, }\DecValTok{4500}\NormalTok{), (}\DecValTok{2}\NormalTok{, }\DecValTok{4600}\NormalTok{)}
-\NormalTok{]}
-\NormalTok{df }\OperatorTok{=}\NormalTok{ spark.createDataFrame(data, [}\StringTok{\textquotesingle{}id\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{])}
-\end{Highlighting}
-\end{Shaded}
-
-If we apply both \texttt{rank()} and \texttt{dense\_rank()} over this
-DataFrame with the same window specification, we can see the difference
-between these functions. In essence, \texttt{rank()} leaves gaps in the
-indexes that come right after any tied rows, while
-\texttt{dense\_rank()} does not.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ rank, dense\_rank}
-\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window}\OperatorTok{\textbackslash{}}
-\NormalTok{    .partitionBy(}\StringTok{\textquotesingle{}id\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .orderBy(}\StringTok{\textquotesingle{}value\textquotesingle{}}\NormalTok{)}
-
-\CommentTok{\# With rank() there are gaps in the indexes}
-\NormalTok{df.withColumn(}\StringTok{\textquotesingle{}with\_rank\textquotesingle{}}\NormalTok{, rank().over(window\_spec))}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+---+-----+---------+
-| id|value|with_rank|
-+---+-----+---------+
-|  1| 2400|        1|
-|  1| 3000|        2|
-|  1| 4200|        3|
-|  1| 4200|        3|
-|  2| 1500|        1|
-|  2| 2000|        2|
-|  2| 3000|        3|
-|  2| 3000|        3|
-|  2| 4500|        5|
-|  2| 4600|        6|
-+---+-----+---------+
-\end{verbatim}
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\CommentTok{\# With dense\_rank() there are no gaps in the indexes}
-\NormalTok{df.withColumn(}\StringTok{\textquotesingle{}with\_dense\_rank\textquotesingle{}}\NormalTok{, dense\_rank().over(window\_spec))}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show()}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+---+-----+---------------+
-| id|value|with_dense_rank|
-+---+-----+---------------+
-|  1| 2400|              1|
-|  1| 3000|              2|
-|  1| 4200|              3|
-|  1| 4200|              3|
-|  2| 1500|              1|
-|  2| 2000|              2|
-|  2| 3000|              3|
-|  2| 3000|              3|
-|  2| 4500|              4|
-|  2| 4600|              5|
-+---+-----+---------------+
-\end{verbatim}
-
-\section{Aggregating window
-functions}\label{agreggating-window-functions}
-
-In essence, all aggregating functions from the
-\texttt{pyspark.sql.functions} module (like \texttt{sum()},
-\texttt{mean()}, \texttt{count()}, \texttt{max()} and \texttt{min()})
-can be used as window functions. So you can apply any aggregating
-function as a window function. You just need to use the \texttt{over()}
-clause with a \texttt{Window} object.
-
-We could, for example, see how much each \texttt{transferValue} deviates
-from the daily mean of transferred values. This might be valuable
-information in case you are planning to do some statistical inference
-over this data. 
Here is an example of what this looks like in
-\texttt{pyspark}:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ mean}
-\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window.partitionBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)}
-
-\NormalTok{mean\_deviation\_expr }\OperatorTok{=}\NormalTok{ (}
-\NormalTok{    col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)}
-    \OperatorTok{{-}}\NormalTok{ mean(col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)).over(window\_spec)}
-\NormalTok{)}
-
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{    .select(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .withColumn(}\StringTok{\textquotesingle{}meanDeviation\textquotesingle{}}\NormalTok{, mean\_deviation\_expr)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------+-------------+-------------------+
-|dateTransfer|transferValue|      meanDeviation|
-+------------+-------------+-------------------+
-|  2022-01-01|      5547.13|-1057.9866666666658|
-|  2022-01-01|       9941.0|  3335.883333333334|
-|  2022-01-01|       5419.9|-1185.2166666666662|
-|  2022-01-01|       5006.0|-1599.1166666666659|
-|  2022-01-01|      8640.06| 2034.9433333333336|
-+------------+-------------+-------------------+
-only showing top 5 rows
-\end{verbatim}
-
-As another example, you might want to calculate how much a specific
-transfer value represents of the total amount transferred daily. You
-could just get the total amount transferred daily by applying the
-\texttt{sum()} function over windows partitioned by
-\texttt{dateTransfer}. Then, you just need to divide the current
-\texttt{transferValue} by the result of this \texttt{sum()} function,
-and you get the proportion you are looking for.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import} \BuiltInTok{sum}
-\NormalTok{proportion\_expr }\OperatorTok{=}\NormalTok{ (}
-\NormalTok{    col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)}
-    \OperatorTok{/} \BuiltInTok{sum}\NormalTok{(col(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)).over(window\_spec)}
-\NormalTok{)}
-
-\NormalTok{transf}\OperatorTok{\textbackslash{}}
-\NormalTok{    .select(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{, }\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .withColumn(}\StringTok{\textquotesingle{}proportionDailyTotal\textquotesingle{}}\NormalTok{, proportion\_expr)}\OperatorTok{\textbackslash{}}
-\NormalTok{    .show(}\DecValTok{5}\NormalTok{)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-+------------+-------------+--------------------+
-|dateTransfer|transferValue|proportionDailyTotal|
-+------------+-------------+--------------------+
-|  2022-01-01|      5547.13|  0.1399705278988259|
-|  2022-01-01|       9941.0| 0.25084088850310493|
-|  2022-01-01|       5419.9|  0.1367601379738435|
-|  2022-01-01|       5006.0|  0.1263162144499088|
-|  2022-01-01|      8640.06|  0.2180143171833957|
-+------------+-------------+--------------------+
-only showing top 5 rows
-\end{verbatim}
-
-\section{\texorpdfstring{Getting the next and previous row with
-\texttt{lead()} and
-\texttt{lag()}}{Getting the next and previous row with lead() and lag()}}\label{getting-the-next-and-previous-row-with-lead-and-lag}
-
-There is one more pair of functions worth talking about in this chapter:
-\texttt{lead()} and \texttt{lag()}. These functions are very useful in
-the context of windows, because they return the value of the next and
-previous rows relative to your current position in the DataFrame.
-
-These functions basically perform the same operation as their peers
-\href{https://dplyr.tidyverse.org/reference/lead-lag.html}{\texttt{dplyr::lead()}
-and \texttt{dplyr::lag()}}\footnote{\url{https://dplyr.tidyverse.org/reference/lead-lag.html}}
-from the \texttt{tidyverse} framework. In essence, \texttt{lead()} will
-return the value of the next row, while \texttt{lag()} will return the
-value of the previous row.
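-
-Both functions also accept an offset and a default value: the offset
-controls how many rows ahead (or behind) to look, and the default is
-used when there is no such row in the window (instead of \texttt{NULL}).
-The snippet below is a minimal sketch of these extra arguments (the new
-column names are just for illustration); a complete example with the
-\texttt{transf} DataFrame comes right after:
-
-\begin{verbatim}
-# A minimal sketch: lead()/lag() with an explicit offset and default
-from pyspark.sql.window import Window
-from pyspark.sql.functions import lead, lag
-
-# lead()/lag() require an ordered window
-ordered_window = Window\
-    .partitionBy('dateTransfer')\
-    .orderBy('datetimeTransfer')
-
-transf\
-    .withColumn('valueTwoAhead', lead('transferValue', 2).over(ordered_window))\
-    .withColumn('previousOrZero', lag('transferValue', 1, 0.0).over(ordered_window))\
-    .select('datetimeTransfer', 'transferValue', 'valueTwoAhead', 'previousOrZero')\
-    .show(5)
-\end{verbatim}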
- -\begin{Shaded} -\begin{Highlighting}[] -\ImportTok{from}\NormalTok{ pyspark.sql.functions }\ImportTok{import}\NormalTok{ lag, lead} -\NormalTok{window\_spec }\OperatorTok{=}\NormalTok{ Window}\OperatorTok{\textbackslash{}} -\NormalTok{ .partitionBy(}\StringTok{\textquotesingle{}dateTransfer\textquotesingle{}}\NormalTok{)}\OperatorTok{\textbackslash{}} -\NormalTok{ .orderBy(}\StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{)} - -\NormalTok{lead\_expr }\OperatorTok{=}\NormalTok{ lead(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).over(window\_spec)} -\NormalTok{lag\_expr }\OperatorTok{=}\NormalTok{ lag(}\StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{).over(window\_spec)} - -\NormalTok{transf}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}nextValue\textquotesingle{}}\NormalTok{, lead\_expr)}\OperatorTok{\textbackslash{}} -\NormalTok{ .withColumn(}\StringTok{\textquotesingle{}previousValue\textquotesingle{}}\NormalTok{, lag\_expr)}\OperatorTok{\textbackslash{}} -\NormalTok{ .select(} - \StringTok{\textquotesingle{}datetimeTransfer\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}transferValue\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}nextValue\textquotesingle{}}\NormalTok{,} - \StringTok{\textquotesingle{}previousValue\textquotesingle{}} -\NormalTok{ )}\OperatorTok{\textbackslash{}} -\NormalTok{ .show(}\DecValTok{5}\NormalTok{)} -\end{Highlighting} -\end{Shaded} - -\begin{verbatim} -+-------------------+-------------+---------+-------------+ -| datetimeTransfer|transferValue|nextValue|previousValue| -+-------------------+-------------+---------+-------------+ -|2022-01-01 03:56:58| 5076.61| 8640.06| NULL| -|2022-01-01 04:07:44| 8640.06| 5006.0| 5076.61| -|2022-01-01 09:00:18| 5006.0| 5419.9| 8640.06| -|2022-01-01 10:17:04| 5419.9| 9941.0| 5006.0| -|2022-01-01 16:14:30| 9941.0| 5547.13| 5419.9| -+-------------------+-------------+---------+-------------+ -only showing top 5 rows -\end{verbatim} - -\bookmarksetup{startatroot} - -\chapter*{References}\label{references} -\addcontentsline{toc}{chapter}{References} - -\markboth{References}{References} - -\phantomsection\label{refs} -\begin{CSLReferences}{1}{0} -\bibitem[\citeproctext]{ref-sparkdoc} -\emph{Apache Spark Official Documentation}. 2022. Documentation for -Apache Spark 3.2.1. \url{https://spark.apache.org/docs/latest/}. - -\bibitem[\citeproctext]{ref-chambers2018} -Chambers, Bill, and Matei Zaharia. 2018. \emph{Spark: The Definitive -Guide: Big Data Processing Made Simple}. Sebastopol, CA: O'Reilly Media. - -\bibitem[\citeproctext]{ref-damji2020} -Damji, Jules, Brooke Wenig, Tathagata Das, and Denny Lee. 2020. -\emph{Learning Spark: Lightning-Fast Data Analytics}. Sebastopol, CA: -O'Reilly Media. - -\bibitem[\citeproctext]{ref-regexinfo} -Goyvaerts, Jan. 2023. {``Regular-Expressions.info.''} -\url{https://www.regular-expressions.info/}. - -\bibitem[\citeproctext]{ref-karau2015} -Karau, Holden, Andy Konwinski, Patrick Wendell, and Matei Zaharia. 2015. -\emph{Learning Spark: Lightning-Fast Data Analytics}. Sebastopol, CA: -O'Reilly Media. - -\end{CSLReferences} - - - -\end{document}