Skip to content

Commit

Permalink
Merge branch 'master' of github.com:michbur/biogram
Browse files Browse the repository at this point in the history
  • Loading branch information
michbur committed Jan 5, 2017
2 parents 152de7e + 7bf46f7 commit 2f5edf1
Show file tree
Hide file tree
Showing 11 changed files with 333 additions and 154 deletions.
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export(calc_cs)
export(calc_ed)
export(calc_ig)
export(calc_kl)
export(calc_pi)
export(calc_si)
export(cluster_reg_exp)
export(code_ngrams)
Expand All @@ -20,6 +21,7 @@ export(count_multigrams)
export(count_ngrams)
export(count_specified)
export(count_total)
export(create_encoding)
export(create_feature_target)
export(create_ngrams)
export(decode_ngrams)
Expand Down Expand Up @@ -50,7 +52,10 @@ importFrom(graphics,par)
importFrom(graphics,plot)
importFrom(graphics,points)
importFrom(partitions,listParts)
importFrom(stats,cutree)
importFrom(stats,dist)
importFrom(stats,dmultinom)
importFrom(stats,hclust)
importFrom(stats,p.adjust)
importFrom(utils,combn)
importFrom(utils,setTxtProgressBar)
Expand Down
205 changes: 93 additions & 112 deletions R/calc_ed.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,12 @@
#' of groups or less than \code{a}. Both \code{a} and {b} must have the the same
#' number of elements.
#' @param prop \code{matrix} of physicochemical properties to normalize the
#' encoding distance. Must have either four or 20 columns and each column should
#' encoding distance. Each column should
#' represent properties of the single amino acid/nucleotide. If \code{NULL},
#' encoding distance is not normalized.
#' @details The encoding distance between \code{a} and \code{b} is defined as the
#' minimum number of amino acids that have to be moved between subgroups of encoding
#' to make \code{a} identical to \code{b} (order of subgroups in the encoding and amino
#' acids in a group is unimportant).
#'
#' @param measure \code{character} vector of length one specifying the measure.
#' Currently avaible measures are \code{"pi"} (partition index) and
#' \code{"si"} (similarity index).
#' If the parameter \code{prop} is supplied, the encoding distance is normalized by the
#' factor equal to the sum of distances for each group in \code{a} and the closest group
#' in \code{b}. The position of a group is defined as the mean value of properties of
Expand All @@ -26,8 +24,6 @@
#' \code{\link{encoding2df}}: converts an encoding to a data frame.
#' \code{\link{validate_encoding}}: validate a structure of an encoding.
#' @return an encoding distance.
#' @importFrom partitions listParts
#' @importFrom combinat permn
#' @export
#' @examples
#' # calculate encoding distance between two encodings of amino acids
Expand All @@ -39,13 +35,24 @@
#' aa2 = list(`1` = c("g", "a", "p", "v", "m", "l", "q"),
#' `2` = c("k", "h", "d", "e", "i"),
#' `3` = c("f", "r", "w", "y", "s", "t", "c", "n"))
#' calc_ed(aa1, aa2)
#' calc_ed(aa1, aa2, measure = "pi")
#'
#' # the encoding distance between two identical encodings is 0
#' calc_ed(aa1, aa1)
#'
#' calc_ed(aa1, aa1, measure = "pi")

calc_ed <- function(a, b, prop = NULL) {
calc_ed <- function(a, b, prop = NULL, measure) {
ed_function <- switch(measure,
si = calc_si_hidden,
pi = calc_pi_hidden)

if(!is.null(prop)) {
if(!is.matrix(prop))
stop("'prop' must have 'matrix' class.")

if(!(ncol(prop) %in% c(4, 20)))
stop("'prop' must have 4 or 20 columns.")
}

# compare temporary a to temporary b
if(length(a) < length(b)) {
warning("'a' must be longer than 'b'. Reverting a and b.")
Expand All @@ -65,13 +72,77 @@ calc_ed <- function(a, b, prop = NULL) {
if(!validate_encoding(ta, unlist(tb)))
stop("Encodings ('a' and 'b') must contain the same elements.")

ed <- ed_function(ta, tb)

if(!is.null(prop)) {
if(!is.matrix(prop))
stop("'prop' must have 'matrix' class.")
coords_a <- lapply(a, function(single_subgroup) rowMeans(prop[, single_subgroup, drop = FALSE]))
coords_b <- lapply(b, function(single_subgroup) rowMeans(prop[, single_subgroup, drop = FALSE]))

if(!(ncol(prop) %in% c(4, 20)))
stop("'prop' must have 4 or 20 columns.")
norm_factor <- sum(sapply(coords_a, function(single_coords_a) {
distances <- sapply(coords_b, function(single_coords_b)
# vector of distances between groups
sqrt(sum((single_coords_a - single_coords_b)^2))
)
# c(dist = min(distances), id = unname(which.min(distances)))
min(distances)
}))

ed <- ed * norm_factor
}

unname(ed)
}


#' Calculate partition index
#'
#' Computes the encoding distance between two encodings.
#' @param a encoding (see \code{\link{validate_encoding}} for more information
#' about the required structure of encoding).
#' @param b encoding to which \code{a} should be compared. Must have equal number
#' of groups or less than \code{a}. Both \code{a} and {b} must have the the same
#' number of elements.
#' @details The encoding distance between \code{a} and \code{b} is defined as the
#' minimum number of amino acids that have to be moved between subgroups of encoding
#' to make \code{a} identical to \code{b} (order of subgroups in the encoding and amino
#' acids in a group is unimportant).
#'
#' If the parameter \code{prop} is supplied, the encoding distance is normalized by the
#' factor equal to the sum of distances for each group in \code{a} and the closest group
#' in \code{b}. The position of a group is defined as the mean value of properties of
#' amino acids or nucleotides belonging the group.
#'
#' See the package vignette for more details.
#' @seealso
#' \code{\link{calc_si}}: compute the similarity index of two encodings.
#' \code{\link{encoding2df}}: converts an encoding to a data frame.
#' \code{\link{validate_encoding}}: validate a structure of an encoding.
#' @return an encoding distance.
#' @importFrom partitions listParts
#' @importFrom combinat permn
#' @export
#' @examples
#' # calculate encoding distance between two encodings of amino acids
#' aa1 = list(`1` = c("g", "a", "p", "v", "m", "l", "i"),
#' `2` = c("k", "h"),
#' `3` = c("d", "e"),
#' `4` = c("f", "r", "w", "y", "s", "t", "c", "n", "q"))
#'
#' aa2 = list(`1` = c("g", "a", "p", "v", "m", "l", "q"),
#' `2` = c("k", "h", "d", "e", "i"),
#' `3` = c("f", "r", "w", "y", "s", "t", "c", "n"))
#' calc_pi(aa1, aa2)
#'
#' # the encoding distance between two identical encodings is 0
#' calc_pi(aa1, aa1)
#'

calc_pi <- function(a, b) {
calc_ed(a, b, measure = "pi")
}

calc_pi_hidden <- function(ta, tb) {

# encoding distance - distance between encodings
ed <- 0

Expand Down Expand Up @@ -102,7 +173,7 @@ calc_ed <- function(a, b, prop = NULL) {
} else {

merges <- create_merges(len_a, len_b)

# list of merged
reduced_size <- lapply(merges, function(single_merge) {
ed <- sum(sapply(single_merge, function(i) {
Expand All @@ -128,23 +199,7 @@ calc_ed <- function(a, b, prop = NULL) {
calc_ed_single(ta, tb)
}

if(!is.null(prop)) {
coords_a <- lapply(a, function(single_subgroup) rowMeans(prop[, single_subgroup, drop = FALSE]))
coords_b <- lapply(b, function(single_subgroup) rowMeans(prop[, single_subgroup, drop = FALSE]))

norm_factor <- sum(sapply(coords_a, function(single_coords_a) {
distances <- sapply(coords_b, function(single_coords_b)
# vector of distances between groups
sqrt(sum((single_coords_a - single_coords_b)^2))
)
# c(dist = min(distances), id = unname(which.min(distances)))
min(distances)
}))

ed <- ed * norm_factor
}

unname(ed)
ed
}

calc_ed_single <- function(ta, tb, ed = 0) {
Expand All @@ -169,90 +224,16 @@ calc_ed_single <- function(ta, tb, ed = 0) {
}))
}


#' Validate encoding
#'
#' Checks the structure of an encoding.
#' @param x encoding.
#' @param u \code{integer}, \code{numeric} or \code{character} vector of all
#' elements belonging to the encoding. See Details.
#' @keywords manip
#' @return \code{TRUE} if the \code{x} is a correctly reduced \code{u},
#' \code{FALSE} in any other cases.
#' @details The encoding is a list of groups to which elements of an alphabet
#' should be reduced. All elements of the alphabet (all
#' amino acids or all nucleotides) should appear in the encoding.
#' @export
#' @keywords manip
#' @seealso
#' \code{\link{calc_ed}}: calculate the encoding distance between two encodings.
#' \code{\link{encoding2df}}: converts an encoding to a data frame.
#' @examples
#' enc1 = list(`1` = c("a", "t"),
#' `2` = c("g", "c"))
#' # see if enc1 is the correctly reduced nucleotide (DNA) alphabet
#' validate_encoding(enc1, c("a", "c", "g", "t"))
#'
#' # enc1 is not the RNA alphabet, so the results is FALSE
#' validate_encoding(enc1, c("a", "c", "g", "u"))
#'
#' # validate_encoding works also on other notations
#' enc2 = list(a = c(1, 4),
#' b = c(2, 3))
#' validate_encoding(enc2, 1L:4)

validate_encoding <- function(x, u) {
if(!is.list(x))
stop("'x' must have 'list' class.")
all(sort(unlist(x)) == sort(u))
}



create_merges <- function(x, n_groups) {
all_merges <- listParts(x)
# all partitions of x of the proper length
chosen_merges <- all_merges[lengths(all_merges) == n_groups]

# all permutations of a single partition
perms <- permn(n_groups)

unlist(lapply(chosen_merges, function(single_merge) {
lapply(perms, function(single_perm)
unname(single_merge)[single_perm])
}), recursive = FALSE)
}

#' Convert encoding to data frame
#'
#' Converts an encoding to a data frame.
#' @param x encoding.
#' @param sort if \code{TRUE} rows are sorted according to elements.
#' @keywords manip
#' @return data frame with two columns. First column represents an index of a
#' group in the supplied encoding and the second column contains all elements of
#' the encoding.
#' @details The encoding is a list of groups to which elements of an alphabet
#' should be reduced. All elements of the alphabet (all
#' amino acids or all nucleotides) should appear in the encoding.
#' @export
#' @keywords manip
#' @seealso
#' \code{\link{calc_ed}}: calculate the encoding distance between two encodings.
#' \code{\link{validate_encoding}}: validate a structure of an encoding.
#' @examples
#' enc1 = list(`1` = c("a", "t"),
#' `2` = c("g", "c"))
#' encoding2df(enc1)


encoding2df <- function(x, sort = FALSE) {
res <- do.call(rbind, lapply(1L:length(x), function(gr_id) {
data.frame(gr_id = gr_id, element = x[[gr_id]])
}))

if(sort)
res <- res[order(levels(res[["element"]])), ]

res
lapply(perms, function(single_perm)
unname(single_merge)[single_perm])
}), recursive = FALSE)
}
7 changes: 4 additions & 3 deletions R/calc_si.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@
#' calc_si(enc1, enc3)

calc_si <- function(a, b) {
if(!validate_encoding(a, unlist(b)))
stop("Encodings ('a' and 'b') must contain the same elements.")

calc_ed(a, b, measure = "si", prop = NULL)
}

calc_si_hidden <- function(a, b) {
# similarity matrix
comp <- encoding2matrix(a) == encoding2matrix(b)
diag(comp) <- 0
Expand Down
Loading

0 comments on commit 2f5edf1

Please sign in to comment.