开发者

R make.unique starting in 1

开发者 https://www.devze.com 2023-04-11 01:51 出处:网络
I have a data frame with columns that are in groups of 4 like so: a b c d a b c d a b c d a b c d... Then, I use the function rep to create tags for the columns:

I have a data frame with columns that are in groups of 4 like so:

a b c d a b c d a b c d a b c d...

Then, I use the function rep to create tags for the columns:

rep(c("a", "b", "c", "d"), len=ncol)

Finally I use the function make.unique to create the tags:

a b c d a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3...

However, I would like to get:

开发者_如何学Ca1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 a4 b4 c4 d4...

Is there an easy way to accomplish this? In the make.unique documentation does not mention any parameters to obtain this behaviour.


Here is a further variant. Applying the function make.unique.2 by @adn.bps can still produces some duplicates:

> u = c("a", "a", "b", "c", "c", "d", "c", "a.1")
> make.unique.2(u)
[1] "a.1" "a.2" "b"   "c.1" "c.2" "d"   "c.3" "a.1"

To avoid that, I've done:

dotify <- function(x, avoid){
  l <- length(x)
  if(l == 1L){
    return(x)
  }
  numbers <- 1L:l
  out <- paste0(x, ".", numbers)
  ndots <- 1L
  while(any(out %in% avoid)){
    ndots <- ndots + 1L
    out <- paste0(x, paste0(rep(".", ndots), collapse = ""), numbers)
  }
  out
}

make.unique2 <- function(x){
  if(anyDuplicated(x)){
    splt <- split(x, x)
    u <- names(splt)
    for(i in 1L:length(splt)){
      splt_i <- splt[[i]]
      j <- match(splt_i[1L], u)
      avoid <- u[-j]
      splt_i_new <- dotify(splt_i, avoid)
      u <- c(avoid, splt_i_new)
      splt[[i]] <- splt_i_new
    }
    x <- unsplit(splt, x)
  }
  x
}

make.unique2(u)
# [1] "a..1" "a..2" "b"    "c.1"  "c.2"  "d"    "c.3"  "a.1" 


make.unique.2 = function(x, sep='.'){
    ave(x, x, FUN=function(a){if(length(a) > 1){paste(a, 1:length(a), sep=sep)} else {a}})
}

Testing against your example:

> u = rep(c("a", "b", "c", "d"), 4)
> make.unique.2(u)
  [1] "a.1" "b.1" "c.1" "d.1" "a.2" "b.2" "c.2" "d.2" "a.3" "b.3" "c.3" "d.3"
 [13] "a.4" "b.4" "c.4" "d.4"

If an element is not duplicated, it is left alone:

> u = c('a', 'a', 'b', 'c', 'c', 'c', 'd')
> make.unique.2(u)
[1] "a.1" "a.2" "b"   "c.1" "c.2" "c.3" "d"


Wouldn't call this pretty, but it does the job:

> ncol <- 10
> apply(expand.grid(c("a","b","c","d"),1:((ncol+3)/4)), 1,
+   function(x)paste(x,collapse=""))[1:ncol]
 [1] "a1" "b1" "c1" "d1" "a2" "b2" "c2" "d2" "a3" "b3"

where ncol is the number of tags to generate.


n <- 4
ncol <- 16
paste(letters[seq(n)], rep(seq(ncol/n), each = n, len = ncol), sep = "")


TLDR

# install.packages('makeunique')
library(makeunique)

# Simple use case
simple_input <- c("a", "b", "c", "d", "a", "b", "c", "d")
make_unique(simple_input, sep = "", wrap_in_brackets = FALSE)

Full answer

Disclaimer: I'm the author of the makeunique package

@Stéphane Laurent's answer is great! But I thought it might help future users to provide a similar but slightly more easily customisable version that I packaged up for my particular use case. Functionally the code below differs from @Stephane's answer in that It throws an informative error if the appended de-duplicating numbers lead to creation of an element that was already in your starting vector. You can then customise the separator or change whether numbers are wrapped in brackets to eliminate the problem. This lets you create unique datasets with more consistent, prettier suffixes

Option 1 (makeunique package)

library(makeunique)

# Simple use case
simple_input <- c("a", "b", "c", "d", "a", "b", "c", "d")
make_unique(simple_input, sep = "", wrap_in_brackets = FALSE)
#> [1] "a1" "b1" "c1" "d1" "a2" "b2" "c2" "d2"


# A harder case which @Stephane highlighted
# make_unique will throw an error instead of automatically fixing, so the user can choose how to resolve
difficult_input <- c("a", "b", "c", "d", "a", "b", "c", "d", "d2")
make_unique(difficult_input, sep = "", wrap_in_brackets = FALSE)
#> Error in make_unique(difficult_input, sep = "", wrap_in_brackets = FALSE): make_unique failed to make vector unique.
#> This is because appending '  <dup_number>' to duplicate values led tocreation of term(s) that were in the original dataset: 
#> [d2]
#> 
#> Please try again with a different argument for either `wrap_in_brackets` or `sep`

# Fix by using '-' as a separator
make_unique(difficult_input, sep = "-", wrap_in_brackets = FALSE)
#> [1] "a-1" "b-1" "c-1" "d-1" "a-2" "b-2" "c-2" "d-2" "d2"

Created on 2022-10-14 by the reprex package (v2.0.1)

Option 2: Use a custom function

Instead of using the package, feel free to just use the source function in your own code

make_unique <- function(x, sep = " ", wrap_in_brackets = TRUE, warn_about_type_conversion = TRUE){
  if(!(is.character(sep) & length(sep) == 1)) stop('`sep` must be a string, not a ', paste0(class(sep), collapse = " "))
  if(!(is.logical(wrap_in_brackets) & length(wrap_in_brackets) == 1)) stop('`wrap_in_brackets` must be a flag, not a ', paste0(class(wrap_in_brackets), collapse = " "))
  if(!(is.logical(warn_about_type_conversion) & length(warn_about_type_conversion) == 1)) stop('`warn_about_type_conversion` must be a flag, not a ', paste0(class(warn_about_type_conversion), collapse = " "))
  if(!any(is.numeric(x),is.character(x),is.factor(x))) stop('input to `make_unique` must be a character, numeric, or factor variable')

  if(is.factor(x)) {
    if(warn_about_type_conversion) warning('make_unique: Converting factor to character variable')
    x <- as.character(x)
  }
  else if(is.numeric(x)) {
    if(warn_about_type_conversion) warning('make_unique: Converting numeric variable to a character vector')
    x <- as.character(x)
  }

  deduplicated = stats::ave(x, x, FUN = function(a){
    if(length(a) > 1){
      suffixes <- seq_along(a)
      if(wrap_in_brackets) suffixes <- paste0('(', suffixes, ')')
      paste0(a, sep, suffixes)
    }
    else {a}
  })

  values_still_duplicated <- deduplicated[duplicated(deduplicated)]

  if(length(stats::na.omit(values_still_duplicated)) > 0){
    stop(
      "make_unique failed to make vector unique.\n",
      "This is because appending '  <dup_number>' to duplicate values led to",
      "creation of term(s) that were in the original dataset: \n[",
      paste0(values_still_duplicated, collapse = ', '),
      "]\n\nPlease try again with a different argument for either `wrap_in_brackets` or `sep`"
    )
  }

  return(deduplicated)
}
0

精彩评论

暂无评论...
验证码 换一张
取 消