runoff.Rnw

\documentclass[12pt]{article}
\usepackage{geometry}                % See geometry.pdf to learn the layout options. There are lots.
\geometry{letterpaper}                   % ... or a4paper or a5paper or ... 
%\geometry{landscape}                % Activate for rotated page geometry
\usepackage[parfill]{parskip}    % Activate to begin paragraphs with an empty line rather than an indent
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{subfig}
\usepackage{verbatim}
\usepackage{fullpage}
\usepackage{pstricks,pst-node,pst-tree}


\title{Breakpoints for farm field runoff}
\author{Wesley Brooks}
\date{}                                           % Activate to display a given date or no date

\begin{document}
\setkeys{Gin}{width=0.9\textwidth}    %make figures a bit wider than the Sweave default.
\maketitle


<<label=read_data, echo=FALSE, include=FALSE, keep.source=True>>=
    # Switch to the project directory when it exists on this machine; on any
    # other machine the data file is looked up in the current working directory.
    project_dir <- 'c:/Users/wrbrooks/git/disco_farms'
    if (dir.exists(project_dir)) {
        setwd(project_dir)
    }
    data_file <- 'Dataset020312long.csv'

    # stringsAsFactors is spelled out because read.csv() no longer converts
    # strings to factors by default (R >= 4.0.0). Without it, levels() below
    # returns NULL and every per-site loop in this document is skipped.
    data <- read.csv(data_file, header=TRUE, na.strings=c('NA', 'na', '', "-9"),
                     stringsAsFactors=TRUE)

    # Identifiers that the analysis chunks loop over
    farms <- levels(data$loc)
    sites <- levels(data$site)

    # Plot titles, looked up by identifier
    farm_names <- list(DF3="DF3", DF1="DF1", DF2="DF2", DF1a="DF1a", DF1b="DF1b", DF2a="DF2a", DF2b="DF2b", DF2c="DF2c")
@


<<label=piecewise, results=hide, echo=FALSE, include=FALSE, keep.source=True>>=
    piecewise <- function(th, x, y) {
        # Objective function for a breakpoint search.
        #   th: candidate breakpoint
        #   x:  predictor values
        #   y:  response values
        # Regresses y on a step at th, x itself, and a hinge at th, i.e. a
        # separate intercept and slope on each side of the breakpoint.
        # Returns the count-weighted sum of the two per-side residual standard
        # errors (each computed with n - 2 degrees of freedom).
        is_upper <- (x - th) >= 0
        design <- cbind(ifelse(x >= th, 1, 0), x, pmax(0, x - th))
        model <- lsfit(design, y)
        res_sq <- model$residuals^2

        # Observation counts on either side of the breakpoint
        n_hi <- sum(is_upper)
        n_lo <- nrow(design) - n_hi

        # Residual standard error of each side
        rse_lo <- sqrt(sum(res_sq[!is_upper]) / (n_lo - 2))
        rse_hi <- sqrt(sum(res_sq[is_upper]) / (n_hi - 2))

        n_lo * rse_lo + n_hi * rse_hi
    }

    piecewise_conts <- function(th, x, y) {
        # Objective function for a breakpoint search with a continuous fit:
        # y is regressed on x and a hinge at th, so the slope may change at
        # th but there is no step term in the design matrix.
        #   th: candidate breakpoint
        #   x:  predictor values
        #   y:  response values
        # Returns the count-weighted sum of the residual standard errors
        # computed separately below and above th (n - 2 degrees of freedom).
        hinge <- pmax(0, x - th)
        design <- cbind(x, hinge, deparse.level=0)
        sq_resid <- lsfit(design, y)$residuals^2

        # Which observations sit at or above the breakpoint, and how many
        above <- (x - th) >= 0
        count_above <- sum(above)
        count_below <- nrow(design) - count_above

        below_term <- count_below * sqrt(sum(sq_resid[!above]) / (count_below - 2))
        above_term <- count_above * sqrt(sum(sq_resid[above]) / (count_above - 2))
        below_term + above_term
    }

    piecewise_disjoint <- function(th, x, y) {
        # Objective function for a breakpoint search with disjoint segments:
        # the design matrix holds a step at th, x, and a hinge at th, which
        # gives each side of the breakpoint its own intercept and slope.
        #   th: candidate breakpoint
        #   x:  predictor values
        #   y:  response values
        # Returns n_below * RSE_below + n_above * RSE_above, where each RSE
        # uses n - 2 degrees of freedom.
        above <- (x - th) >= 0
        predictors <- cbind(ifelse(x >= th, 1, 0), x, pmax(0, x - th))
        resid_sq <- lsfit(predictors, y)$residuals^2

        # Contribution of one side: its size times its residual standard error
        side_score <- function(members, n) {
            n * sqrt(sum(resid_sq[members]) / (n - 2))
        }

        n_above <- sum(above)
        n_below <- nrow(predictors) - n_above

        side_score(!above, n_below) + side_score(above, n_above)
    }
    
    piecewise_constant <- function(th, x, y) {
        # Objective function for a breakpoint search with a piecewise-constant
        # fit: y is regressed on a single step at th, so each side of the
        # breakpoint is fitted by its own mean.
        #   th: candidate breakpoint
        #   x:  predictor values
        #   y:  response values
        # Returns the count-weighted sum of the per-side residual standard
        # errors, using the same n - 2 divisor as the other piecewise_*
        # objectives in this file.
        upper <- (x - th) >= 0
        X <- ifelse(x >= th, 1, 0)
        fit <- lsfit(X, y) #actually fit the model

        # Counts how many observations lie above and below the breakpoint.
        # X is a plain vector here, so dim(X) is NULL; count with length(x).
        n_upper <- sum(upper)
        n_lower <- length(x) - n_upper

        #Sum of squared residuals on either side of the breakpoint
        ssr_upper <- sum(fit$residuals[upper]^2)
        ssr_lower <- sum(fit$residuals[!upper]^2)

        n_lower * sqrt(ssr_lower / (n_lower - 2)) + n_upper * sqrt(ssr_upper / (n_upper - 2))
    }
@

<<echo=False, include=False>>=
    anova.bp <- function(data, split.on, site=NA, conts=FALSE, discrete.xrange=FALSE, xrange=NA, min.in.node=4, return.models=FALSE, return.X=FALSE) {
        # Find a breakpoint in data[, split.on] for one site and test the
        # piecewise model of the runoff coefficient (column rc) against a
        # single straight line.
        #
        #   data:            data frame with columns site, rc and split.on
        #   split.on:        name of the column to place the breakpoint on
        #   site:            value of data$site to analyse
        #   conts:           TRUE  -> continuous fit (objective piecewise_conts)
        #                    FALSE -> separate intercepts (piecewise_disjoint)
        #   discrete.xrange: TRUE  -> pick the best candidate from xrange
        #                    FALSE -> continuous search with optimize()
        #   xrange:          candidate breakpoints for the discrete search
        #   min.in.node:     the continuous search is bounded by the
        #                    min.in.node-th smallest and largest values; it is
        #                    not applied to the discrete search
        #   return.X:        return list(line, bp, theta) with the predictors
        #   return.models:   return list(bp, line, theta) with the lm fits
        #
        # By default returns the F-test p-value formatted as "p = 0.0123" or
        # "p < 0.0001".

        #First, remove datapoints that have NA for the splitting variable:
        data <- data[!is.na(data[,split.on]),]

        # Predictor and response for the requested site, extracted once
        at_site <- data$site==site
        x <- data[at_site, split.on]
        y <- data[at_site,]$rc

        objective <- if (conts) piecewise_conts else piecewise_disjoint

        if (discrete.xrange) {
            # Index xrange directly so the candidates need not be consecutive
            # integers starting at xrange[1].
            scores <- vapply(xrange, objective, numeric(1), x=x, y=y)
            th <- xrange[which.min(scores)]
        }
        else {
            lower_lim <- sort(x)[min.in.node]
            upper_lim <- sort(x, decreasing=TRUE)[min.in.node]
            th <- optimize(objective, upper=upper_lim, lower=lower_lim, x=x, y=y)$minimum
        }

        # which.min() returns nothing when every candidate scored NA/NaN
        if (length(th) != 1 || is.na(th))
            stop("anova.bp: no usable breakpoint found for '", split.on, "' at site '", site, "'", call.=FALSE)

        #Now generate the matrix of predictors (i.e. the X matrix).
        #deparse.level=0 keeps the columns unnamed, as callers of return.models expect.
        hinge <- pmax(0, x - th)
        if (conts)
            X.bp <- cbind(x, hinge, deparse.level=0)
        else
            X.bp <- cbind(ifelse(x>=th, 1, 0), x, hinge, deparse.level=0)

        X.line <- x

        if(return.X)
            return(list(line=X.line, bp=X.bp, theta=th))

        fit.bp <- lm(y~X.bp)
        fit.line <- lm(y~X.line)

        if(return.models)
            return(list(bp=fit.bp, line=fit.line, theta=th))

        #Calculate the p-value for the hypothesis of no difference between the two lines:
        pval <- anova(fit.bp, fit.line)[["Pr(>F)"]][2]

        #Return the p-value in a legible format:
        if (pval < 0.0001)
            return("p < 0.0001")
        else
            return(paste("p = ", round(pval,4), sep=""))
    }
@


<<label=piecewise_plot, include=False, echo=FALSE, keep.source=True>>=
    piecewise_plot <- function(x,y,th, xlim=FALSE, ylim=FALSE, xlab='', ylab='', title='', p=1, label_y=0.85) {
        # Scatter plot of y against x with the two fitted regression segments
        # on either side of the breakpoint th.
        #   x, y:       data to plot and fit
        #   th:         breakpoint; marked with a dotted vertical line
        #   xlim, ylim: axis limits; FALSE means use the range of the data
        #   xlab, ylab: axis labels
        #   title:      main title of the panel
        #   p:          text printed under the breakpoint value in the label
        #   label_y:    height (in y units) at which the label is drawn
        # The segments are added with lines(), so the device's graphical
        # parameters are left as they were found.

        #define the limits of the plot
        xx <- if (identical(xlim, FALSE)) range(x, na.rm=TRUE) else xlim
        yy <- if (identical(ylim, FALSE)) range(y, na.rm=TRUE) else ylim

        #put the data points on the plot
        plot(x, y, xlab=xlab, ylab=ylab, xlim=xx, ylim=yy, pch=20, bty='n', xaxt='s', yaxt='s', ann=TRUE, main=title)

        #create the piecewise regression model
        X <- cbind(ifelse(x>=th, 1, 0), x, pmax(0, x-th))
        beta <- lsfit(X, y)$coefficients  # intercept, step, slope, hinge

        #draw the lower regression line, from the left edge to the breakpoint
        xints_low <- c(xx[1], th)
        lines(xints_low, beta[1] + (beta[3]*xints_low))

        #draw the upper regression line, from the breakpoint to the right edge
        xints_high <- c(th, xx[2])
        lines(xints_high, beta[1] + beta[2] + (beta[3]*xints_high) + (beta[4]*(xints_high - th)))

        #draw a vertical line to separate the two regimes
        abline(v=th, lty=3)
        text(x=th, y=label_y, pos=4, labels=paste("breakpoint: ", round(th,2), "\n", p, sep=""))
    }
@

The piecewise models we use here allow a separate slope and intercept on either side of the breakpoint. The breakpoint chosen is the one that minimizes the residual standard error, with the condition that there must be at least four data points on both sides of the breakpoint. The p-values cited in this paper are based on an F test of the improvement in model fit between the piecewise model (with four parameters: a slope and an intercept for each line segment) and a linear model (with two parameters: one slope and one intercept covering the entire range of data).

<<label=soil moisture breakpoints, echo=FALSE, include=FALSE, keep.source=True>>=
    # Candidate soil-moisture breakpoints
    xs <- seq(33, 43)

    # Results, filled in per site:
    #   sm.breakpoints  breakpoints in the order of `sites`
    #   breakpoints     the same breakpoints, looked up by site name
    #   sm.pval         formatted p-value of piecewise vs. straight-line fit
    #   fit             least-squares fit of the piecewise model
    sm.breakpoints <- numeric(length(sites))
    sm.pval <- list()
    breakpoints <- list()
    fit <- list()
    cat("Soil moisture breakpoints by farm:\n")

    for(k in seq_along(sites)) {
        site <- sites[k]

        # Drop events without a soil-moisture reading, as anova.bp() does;
        # a single NA would make every candidate's score NA.
        farm <- data[data$site==site & !is.na(data$sm),]

        # Pick the candidate with the smallest objective. Indexing xs directly
        # keeps this correct if the candidate grid is ever changed.
        scores <- vapply(xs, piecewise, numeric(1), x=farm$sm, y=farm$rc)
        th <- xs[which.min(scores)]

        sm.breakpoints[k] <- th
        breakpoints[[site]] <- th
        sm.pval[[site]] <- anova.bp(data, split.on="sm", site=site, discrete.xrange=TRUE, xrange=xs, min.in.node=4)

        X <- cbind( ifelse(farm$sm>=th, 1, 0), farm$sm, pmax(0, farm$sm-th) )
        fit[[site]] <- lsfit(x=X, y=farm$rc)
        cat( paste(site, ": ", th, "; ", sm.pval[[site]], "\n", sep=""))
    }
@


<<label=sm_plots, echo=FALSE, include=FALSE, keep.source=True>>=
    # One panel per site on a 4 x 2 grid, filled column by column
    layout(matrix(1:8,4,2))

    # seq_along() so that nothing is drawn when `sites` is empty
    for( i in seq_along(sites) ) {
        farm <- data[data$site==sites[i],]

        # Every panel shares the axis limits of the full data set
        piecewise_plot(x=farm$sm, y=farm$rc, th=sm.breakpoints[i],
                         ylim=range(data$rc, na.rm=TRUE), xlim=range(data$sm, na.rm=TRUE),
                         xlab="soil moisture", ylab="runoff coefficient",
                         title=farm_names[[sites[i]]],
                         p=sm.pval[[sites[i]]])
    }
@


\begin{figure}
    \begin{center}
<<label=sm, fig=True, echo=False, width=7, height=9>>=
<<sm_plots>>
@
    \end{center}
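    \caption{Runoff coefficient versus soil moisture at each site, with the fitted piecewise regression lines and the selected breakpoint (dotted vertical line).\label{sm}}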
\end{figure}


Now get the I30 breakpoints:\\

<<label=I30_breakpoints, echo=FALSE>>=
    # Formatted p-value of the I30 breakpoint model, looked up by site name
    I30.pval <- list()
    cat("I30 breakpoints by farm:\n")

    for(site in sites) {
        # Events at this site that have an I30 value
        farm <- data[data$site==site & !is.na(data$I30),]

        # Search between the 4th smallest and 4th largest I30 value, so that
        # at least four events lie on each side of the breakpoint
        lower_lim <- sort( farm$I30 )[4]
        upper_lim <- sort( farm$I30, decreasing=TRUE )[4]
        th <- optimize(piecewise, c(upper_lim, lower_lim), x=farm$I30, y=farm$rc)$minimum
        I30.pval[[site]] <- anova.bp(data, split.on="I30", site=site, discrete.xrange=FALSE, min.in.node=4)

        # anova.bp() already returns a labelled string such as "p = 0.0123",
        # so no extra "p=" prefix is added here.
        cat( paste(site, ": ", round(th,1), "; ", I30.pval[[site]], "\n", sep=""))
    }
@

Now split the data into two bins based on the soil moisture breakpoint (above vs. below the breakpoint) and find the I30 breakpoint in each bin:\\


<<label=I30_by_binned_SM, echo=False>>=
    # Results, looked up by site name:
    #   I30.breakpoints  I30 breakpoint in the low and high soil-moisture bin
    #   limits           soil-moisture bin edges: 0, the SM breakpoint, Inf
    #   I30.pval         formatted p-values for the two bins
    I30.breakpoints <- list()
    limits <- list()
    I30.pval <- list()

    for(site in farms) {
        limits[[site]] <- c(0, breakpoints[[site]], Inf)
        site_data <- data[data$site==site,]
        bp <- numeric(2)
        cat(paste("Intensity breakpoints at ", site, " when binned by soil moisture:\n", sep=""))

        for(i in 1:2) {
            sm_lo <- limits[[site]][i]
            sm_hi <- limits[[site]][i+1]

            # Events in this soil-moisture bin that have an I30 value. The
            # explicit !is.na(sm) keeps events with missing soil moisture from
            # entering the bin as all-NA rows.
            in_bin <- !is.na(site_data$sm) & site_data$sm>=sm_lo & site_data$sm<sm_hi & !is.na(site_data$I30)
            bin <- site_data[in_bin,]

            # Search between the 4th smallest and 4th largest I30 value
            lower_lim <- sort( bin$I30 )[4]
            upper_lim <- sort( bin$I30, decreasing=TRUE )[4]
            th <- optimize(piecewise, c(upper_lim, lower_lim), x=bin$I30, y=bin$rc)$minimum
            I30.pval[[site]][i] <- anova.bp(bin, split.on="I30", site=site, discrete.xrange=FALSE, min.in.node=4)

            # anova.bp() already returns a labelled string such as "p = 0.0123"
            cat( paste(sm_lo, " <= SM < ", sm_hi, ": ", round(th,1), "; ", I30.pval[[site]][i], "\n", sep=""))
            bp[i] <- th
        }
        I30.breakpoints[[site]] <- bp
        cat("\n")
    }
@

<<label=binned_I30_plots, include=False, echo=False>>=
    # Three rows of two panels, filled row by row:
    # one row per farm, low soil-moisture bin on the left
    layout(matrix(1:6, nrow=3, ncol=2, byrow=TRUE))

    for(site in farms) {
        sm_cut <- limits[[site]][2]
        titles <- c(paste0(farm_names[[site]], " (SM < ", sm_cut, "%)"),
                    paste0(farm_names[[site]], " (", sm_cut, "% <= SM)"))
        site_data <- data[data$site==site,]

        for(i in 1:2) {
            # Events of this farm inside the i-th soil-moisture bin with an I30 value
            sm_from <- limits[[site]][i]
            sm_to <- limits[[site]][i+1]
            bin <- site_data[site_data$sm>=sm_from & site_data$sm<sm_to & !is.na(site_data$I30),]

            # Axis limits come from the full data set so the panels are comparable
            piecewise_plot(x=bin$I30, y=bin$rc,
                             th=I30.breakpoints[[site]][i],
                             xlim=range(data$I30, na.rm=T),
                             ylim=range(data$rc, na.rm=T),
                             xlab="I30", ylab="runoff coefficient",
                             title=titles[i],
                             p=I30.pval[[site]][i])
        }
    }
@


\begin{figure}
    \begin{center}
<<label=I30_binned, fig=True, echo=False, width=7, height=8>>=
<<binned_I30_plots>>
@
    \end{center}
    \caption{Top row: DF1, middle row: DF2, bottom row: DF3.\label{I30_binned}}
\end{figure}


When we put the events in bins based on their antecedent soil moisture (SM: high or low), the following are the precipitation breakpoints (units are centimeters of rain):\\

<<label=precip_by_binned_SM, include=False, echo=False>>=
    # Results, looked up by site name:
    #   precip.breakpoints  precipitation breakpoint in the low and high
    #                       soil-moisture bin
    #   precip.pval         formatted p-values for the two bins
    precip.breakpoints <- list()
    precip.pval <- list()

    for(site in farms) {
        site_data <- data[data$site==site,]
        bp <- numeric(2)
        cat(paste("Precipitation breakpoints at ", site, " when binned by soil moisture:\n", sep=""))

        for(i in 1:2) {
            sm_lo <- limits[[site]][i]
            sm_hi <- limits[[site]][i+1]

            # Events in this soil-moisture bin that have a precipitation
            # value. The explicit !is.na(sm) keeps events with missing soil
            # moisture from entering the bin as all-NA rows.
            in_bin <- !is.na(site_data$sm) & site_data$sm>=sm_lo & site_data$sm<sm_hi & !is.na(site_data$prec)
            bin <- site_data[in_bin,]

            # Search between the 4th smallest and 4th largest precipitation value
            lower_lim <- sort( bin$prec )[4]
            upper_lim <- sort( bin$prec, decreasing=TRUE )[4]
            th <- optimize(piecewise, c(upper_lim, lower_lim), x=bin$prec, y=bin$rc)$minimum
            precip.pval[[site]][i] <- anova.bp(bin, split.on="prec", site=site, discrete.xrange=FALSE, min.in.node=4)

            # anova.bp() already returns a labelled string such as "p = 0.0123"
            cat( paste(sm_lo, " <= SM < ", sm_hi, ": ", round(th,2), "; ", precip.pval[[site]][i], "\n", sep=""))
            bp[i] <- th
        }
        precip.breakpoints[[site]] <- bp
        cat("\n")
    }
@


<<label=binned_precip_plots, include=False, echo=False>>=
    # Three rows of two panels, filled row by row:
    # one row per farm, low soil-moisture bin on the left
    layout(matrix(1:6, nrow=3, ncol=2, byrow=TRUE))

    for(site in farms) {
        sm_cut <- limits[[site]][2]
        titles <- c(paste0(farm_names[[site]], " (SM < ", sm_cut, "%)"),
                    paste0(farm_names[[site]], " (", sm_cut, "% <= SM)"))
        site_data <- data[data$site==site,]

        for(i in 1:2) {
            # Events of this farm inside the i-th soil-moisture bin with a
            # precipitation value
            sm_from <- limits[[site]][i]
            sm_to <- limits[[site]][i+1]
            bin <- site_data[site_data$sm>=sm_from & site_data$sm<sm_to & !is.na(site_data$prec),]

            # Axis limits come from the full data set so the panels are comparable
            piecewise_plot(x=bin$prec, y=bin$rc,
                             th=precip.breakpoints[[site]][i],
                             xlim=range(data$prec, na.rm=T),
                             ylim=range(data$rc, na.rm=T),
                             xlab="precip", ylab="runoff coefficient",
                             title=titles[i],
                             p=precip.pval[[site]][i])
        }
    }
@


\begin{figure}
    \begin{center}
<<label=precip_binned, fig=True, echo=False, width=7, height=8>>=
<<binned_precip_plots>>
@
    \end{center}
    \caption{Top row: DF1, middle row: DF2, bottom row: DF3.\label{precip_binned}}
\end{figure}

\end{document}