Merge branch 'master' of github.com:michbur/biogram

michbur · Jan 5, 2017 · 2f5edf1 · 2f5edf1
2 parents 152de7e + 7bf46f7
commit 2f5edf1
Show file tree

Hide file tree

Showing 11 changed files with 333 additions and 154 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,7 @@ export(calc_cs)
 export(calc_ed)
 export(calc_ig)
 export(calc_kl)
+export(calc_pi)
 export(calc_si)
 export(cluster_reg_exp)
 export(code_ngrams)
@@ -20,6 +21,7 @@ export(count_multigrams)
 export(count_ngrams)
 export(count_specified)
 export(count_total)
+export(create_encoding)
 export(create_feature_target)
 export(create_ngrams)
 export(decode_ngrams)
@@ -50,7 +52,10 @@ importFrom(graphics,par)
 importFrom(graphics,plot)
 importFrom(graphics,points)
 importFrom(partitions,listParts)
+importFrom(stats,cutree)
+importFrom(stats,dist)
 importFrom(stats,dmultinom)
+importFrom(stats,hclust)
 importFrom(stats,p.adjust)
 importFrom(utils,combn)
 importFrom(utils,setTxtProgressBar)

diff --git a/R/calc_ed.R b/R/calc_ed.R
@@ -7,14 +7,12 @@
 #' of groups or less than \code{a}. Both \code{a} and {b} must have the the same 
 #' number of elements.
 #' @param prop \code{matrix} of physicochemical properties to normalize the 
-#' encoding distance.  Must have either four or 20 columns and each column should 
+#' encoding distance.  Each column should 
 #' represent properties of the single amino acid/nucleotide. If \code{NULL},
 #' encoding distance is not normalized.
-#' @details The encoding distance between \code{a} and \code{b} is defined as the 
-#' minimum number of amino acids that have to be moved between subgroups of encoding 
-#' to make \code{a} identical to \code{b} (order of subgroups in the encoding and amino 
-#' acids in a group is unimportant).
-#' 
+#' @param measure \code{character} vector of length one specifying the measure. 
+#' Currently avaible measures are \code{"pi"} (partition index) and 
+#' \code{"si"} (similarity index).
 #' If the parameter \code{prop} is supplied, the encoding distance is normalized by the 
 #' factor equal to the sum of distances for each group in \code{a} and the closest group 
 #' in \code{b}. The position of a group is defined as the mean value of properties of 
@@ -26,8 +24,6 @@
 #' \code{\link{encoding2df}}: converts an encoding to a data frame.
 #' \code{\link{validate_encoding}}: validate a structure of an encoding.
 #' @return an encoding distance.
-#' @importFrom partitions listParts
-#' @importFrom combinat permn
 #' @export
 #' @examples
 #' # calculate encoding distance between two encodings of amino acids
@@ -39,13 +35,24 @@
 #' aa2 = list(`1` = c("g", "a", "p", "v", "m", "l", "q"), 
 #'            `2` = c("k", "h", "d", "e", "i"), 
 #'            `3` = c("f", "r", "w", "y", "s", "t", "c", "n"))
-#' calc_ed(aa1, aa2) 
+#' calc_ed(aa1, aa2, measure = "pi") 
 #'     
 #' # the encoding distance between two identical encodings is 0
-#' calc_ed(aa1, aa1) 
-#'  
+#' calc_ed(aa1, aa1, measure = "pi") 
 
-calc_ed <- function(a, b, prop = NULL) {
+calc_ed <- function(a, b, prop = NULL, measure) {
+  ed_function <- switch(measure, 
+                        si = calc_si_hidden,
+                        pi = calc_pi_hidden)
+
+  if(!is.null(prop)) {
+    if(!is.matrix(prop))
+      stop("'prop' must have 'matrix' class.")
+
+    if(!(ncol(prop) %in% c(4, 20)))
+      stop("'prop' must have 4 or 20 columns.")
+  }
+
   # compare temporary a to temporary b
   if(length(a) < length(b)) {
     warning("'a' must be longer than 'b'. Reverting a and b.") 
@@ -65,13 +72,77 @@ calc_ed <- function(a, b, prop = NULL) {
   if(!validate_encoding(ta, unlist(tb)))
     stop("Encodings ('a' and 'b') must contain the same elements.")
 
+  ed <- ed_function(ta, tb)
+
   if(!is.null(prop)) {
-    if(!is.matrix(prop))
-      stop("'prop' must have 'matrix' class.")
+    coords_a <- lapply(a, function(single_subgroup) rowMeans(prop[, single_subgroup, drop = FALSE]))
+    coords_b <- lapply(b, function(single_subgroup) rowMeans(prop[, single_subgroup, drop = FALSE]))
 
-    if(!(ncol(prop) %in% c(4, 20)))
-      stop("'prop' must have 4 or 20 columns.")
+    norm_factor <- sum(sapply(coords_a, function(single_coords_a) {
+      distances <- sapply(coords_b, function(single_coords_b) 
+        # vector of distances between groups
+        sqrt(sum((single_coords_a - single_coords_b)^2))
+      )
+      # c(dist = min(distances), id = unname(which.min(distances)))
+      min(distances)
+    }))
+
+    ed <- ed * norm_factor
   }
+
+  unname(ed)
+}
+
+
+#' Calculate partition index
+#' 
+#' Computes the encoding distance between two encodings.
+#' @param a encoding (see \code{\link{validate_encoding}} for more information 
+#' about the required structure of encoding).
+#' @param b encoding to which \code{a} should be compared. Must have equal number 
+#' of groups or less than \code{a}. Both \code{a} and {b} must have the the same 
+#' number of elements.
+#' @details The encoding distance between \code{a} and \code{b} is defined as the 
+#' minimum number of amino acids that have to be moved between subgroups of encoding 
+#' to make \code{a} identical to \code{b} (order of subgroups in the encoding and amino 
+#' acids in a group is unimportant).
+#' 
+#' If the parameter \code{prop} is supplied, the encoding distance is normalized by the 
+#' factor equal to the sum of distances for each group in \code{a} and the closest group 
+#' in \code{b}. The position of a group is defined as the mean value of properties of 
+#' amino acids or nucleotides belonging the group.
+#' 
+#' See the package vignette for more details.
+#' @seealso 
+#' \code{\link{calc_si}}: compute the similarity index of two encodings.
+#' \code{\link{encoding2df}}: converts an encoding to a data frame.
+#' \code{\link{validate_encoding}}: validate a structure of an encoding.
+#' @return an encoding distance.
+#' @importFrom partitions listParts
+#' @importFrom combinat permn
+#' @export
+#' @examples
+#' # calculate encoding distance between two encodings of amino acids
+#' aa1 = list(`1` = c("g", "a", "p", "v", "m", "l", "i"), 
+#'            `2` = c("k", "h"), 
+#'            `3` = c("d", "e"), 
+#'            `4` = c("f", "r", "w", "y", "s", "t", "c", "n", "q"))
+#' 
+#' aa2 = list(`1` = c("g", "a", "p", "v", "m", "l", "q"), 
+#'            `2` = c("k", "h", "d", "e", "i"), 
+#'            `3` = c("f", "r", "w", "y", "s", "t", "c", "n"))
+#' calc_pi(aa1, aa2) 
+#'     
+#' # the encoding distance between two identical encodings is 0
+#' calc_pi(aa1, aa1) 
+#'  
+
+calc_pi <- function(a, b) {
+  calc_ed(a, b, measure = "pi")
+}
+
+calc_pi_hidden <- function(ta, tb) {
+
   # encoding distance - distance between encodings
   ed <- 0
 
@@ -102,7 +173,7 @@ calc_ed <- function(a, b, prop = NULL) {
     } else {
 
       merges <- create_merges(len_a, len_b)
-
+      
       # list of merged
       reduced_size <- lapply(merges, function(single_merge) {
         ed <- sum(sapply(single_merge, function(i) {
@@ -128,23 +199,7 @@ calc_ed <- function(a, b, prop = NULL) {
     calc_ed_single(ta, tb)
   }
 
-  if(!is.null(prop)) {
-    coords_a <- lapply(a, function(single_subgroup) rowMeans(prop[, single_subgroup, drop = FALSE]))
-    coords_b <- lapply(b, function(single_subgroup) rowMeans(prop[, single_subgroup, drop = FALSE]))
-
-    norm_factor <- sum(sapply(coords_a, function(single_coords_a) {
-      distances <- sapply(coords_b, function(single_coords_b) 
-        # vector of distances between groups
-        sqrt(sum((single_coords_a - single_coords_b)^2))
-      )
-      # c(dist = min(distances), id = unname(which.min(distances)))
-      min(distances)
-    }))
-
-    ed <- ed * norm_factor
-  }
-
-  unname(ed)
+  ed
 }
 
 calc_ed_single <- function(ta, tb, ed = 0) {
@@ -169,90 +224,16 @@ calc_ed_single <- function(ta, tb, ed = 0) {
   }))
 }
 
-
-#' Validate encoding
-#'
-#' Checks the structure of an encoding.
-#' @param x encoding.
-#' @param u \code{integer}, \code{numeric} or \code{character} vector of all
-#' elements belonging to the encoding. See Details.
-#' @keywords manip
-#' @return \code{TRUE} if the \code{x} is a correctly reduced \code{u}, 
-#' \code{FALSE} in any other cases.
-#' @details The encoding is a list of groups to which elements of an alphabet 
-#' should be reduced. All elements of the alphabet (all 
-#' amino acids or all nucleotides) should appear in the encoding.
-#' @export
-#' @keywords manip
-#' @seealso 
-#' \code{\link{calc_ed}}: calculate the encoding distance between two encodings.
-#' \code{\link{encoding2df}}: converts an encoding to a data frame.
-#' @examples
-#' enc1 = list(`1` = c("a", "t"), 
-#'             `2` = c("g", "c"))
-#' # see if enc1 is the correctly reduced nucleotide (DNA) alphabet
-#' validate_encoding(enc1, c("a", "c", "g", "t"))
-#' 
-#' # enc1 is not the RNA alphabet, so the results is FALSE
-#' validate_encoding(enc1, c("a", "c", "g", "u"))
-#' 
-#' # validate_encoding works also on other notations
-#' enc2 = list(a = c(1, 4),
-#'             b = c(2, 3))
-#' validate_encoding(enc2, 1L:4)
-
-validate_encoding <- function(x, u) {
-  if(!is.list(x))
-    stop("'x' must have 'list' class.")
-  all(sort(unlist(x)) == sort(u))
-}
-
-
-
 create_merges <- function(x, n_groups) {
   all_merges <- listParts(x)
   # all partitions of x of the proper length
   chosen_merges <- all_merges[lengths(all_merges) == n_groups]
-
+  
   # all permutations of a single partition
   perms <- permn(n_groups)
 
   unlist(lapply(chosen_merges, function(single_merge) {
-      lapply(perms, function(single_perm) 
-        unname(single_merge)[single_perm])
-    }), recursive = FALSE)
-}
-
-#' Convert encoding to data frame
-#'
-#' Converts an encoding to a data frame.
-#' @param x encoding.
-#' @param sort if \code{TRUE} rows are sorted according to elements.
-#' @keywords manip
-#' @return data frame with two columns. First column represents an index of a 
-#' group in the supplied encoding and the second column contains all elements of 
-#' the encoding.
-#' @details The encoding is a list of groups to which elements of an alphabet 
-#' should be reduced. All elements of the alphabet (all 
-#' amino acids or all nucleotides) should appear in the encoding.
-#' @export
-#' @keywords manip
-#' @seealso 
-#' \code{\link{calc_ed}}: calculate the encoding distance between two encodings.
-#' \code{\link{validate_encoding}}: validate a structure of an encoding.
-#' @examples
-#' enc1 = list(`1` = c("a", "t"), 
-#'             `2` = c("g", "c"))
-#' encoding2df(enc1)
-
-
-encoding2df <- function(x, sort = FALSE) {
-  res <- do.call(rbind, lapply(1L:length(x), function(gr_id) {
-    data.frame(gr_id = gr_id, element = x[[gr_id]])
-  }))
-
-  if(sort)
-    res <- res[order(levels(res[["element"]])), ]
-
-  res
+    lapply(perms, function(single_perm) 
+      unname(single_merge)[single_perm])
+  }), recursive = FALSE)
 }
diff --git a/R/calc_si.R b/R/calc_si.R
@@ -30,9 +30,10 @@
 #' calc_si(enc1, enc3)
 
 calc_si <- function(a, b) {
-  if(!validate_encoding(a, unlist(b)))
-    stop("Encodings ('a' and 'b') must contain the same elements.")
-
+  calc_ed(a, b, measure = "si", prop = NULL)
+}
+
+calc_si_hidden <- function(a, b) {
   # similarity matrix
   comp <- encoding2matrix(a) == encoding2matrix(b)
   diag(comp) <- 0