skip to Main Content

I have a data frame that looks something like this:

df <- as.data.frame(matrix(c("True Organic", "True Organic", NA, NA, NA, 0,
         "True Organic", "True Organic", NA, NA, NA, 0,
         "Organic Search (SEO)", "Induced Organic", NA, NA, NA, 0,
         "Display", NA, NA, NA, NA, 0,
         "Social Ads (Act)", "Induced Organic", "Induced Organic", NA, NA, 1,
         "Referral", "Social Ads (Act)", NA, NA, NA, 0,
         "Special Emails", "Induced Organic", NA, NA, NA, 1,
         "Daily Email", "Daily Email", "Daily Email", NA, NA, 0), nrow = 8,
         ncol = 6, byrow = TRUE, dimnames = list(NULL, c("Node_1", "Node_2",
                                                         "Node_3", "Node_4",
                                                         "Node_5", "conversion"))), ,
         stringsAsFactors = FALSE)


df

                Node_1           Node_2          Node_3 Node_4 Node_5 conversion
1         True Organic     True Organic            <NA>   <NA>   <NA>          0
2         True Organic     True Organic            <NA>   <NA>   <NA>          0
3 Organic Search (SEO)  Induced Organic            <NA>   <NA>   <NA>          0
4              Display             <NA>            <NA>   <NA>   <NA>          0
5     Social Ads (Act)  Induced Organic Induced Organic   <NA>   <NA>          1
6             Referral Social Ads (Act)            <NA>   <NA>   <NA>          0
7       Special Emails  Induced Organic            <NA>   <NA>   <NA>          1
8          Daily Email      Daily Email     Daily Email   <NA>   <NA>          0

For each row, I want to replace all NAs with either the string “Null” if the conversion column for that row is equal to 0, or with “Conversion” if the conversion column is equal to 1.

My final output should look like this.

df_desired <- as.data.frame(matrix(c("True Organic", "True Organic", "Null", "Null", "Null", 0,
                                 "True Organic", "True Organic", "Null", "Null", "Null", 0,
                                 "Organic Search (SEO)", "Induced Organic", "Null", "Null", "Null", 0,
                                 "Display", "Null", "Null", "Null", "Null", 0,
                                 "Social Ads (Act)", "Induced Organic", "Induced Organic", "Conversion", "Conversion", 1,
                                 "Referral", "Social Ads (Act)","Null", "Null", "Null", 0,
                                 "Special Emails", "Induced Organic", "Converison", "Conversion", "Conversion", 1,
                                 "Daily Email", "Daily Email", "Daily Email", "Null", "Null", 0), nrow = 8,
                               ncol = 6, byrow = TRUE, dimnames = list(NULL, c("Node_1", "Node_2",
                                                                               "Node_3", "Node_4",
                                                                               "Node_5", "conversion"))), ,
                        stringsAsFactors = FALSE)


df_desired 


               Node_1           Node_2          Node_3     Node_4     Node_5 conversion
1         True Organic     True Organic            Null       Null       Null          0
2         True Organic     True Organic            Null       Null       Null          0
3 Organic Search (SEO)  Induced Organic            Null       Null       Null          0
4              Display             Null            Null       Null       Null          0
5     Social Ads (Act)  Induced Organic Induced Organic Conversion Conversion          1
6             Referral Social Ads (Act)            Null       Null       Null          0
7       Special Emails  Induced Organic      Converison Conversion Conversion          1
8          Daily Email      Daily Email     Daily Email       Null       Null          0  

I am able to do this with a nested for loop.

  for (i in 1:nrow(df)){
  for (j in 1:ncol(df)){
    df[i,j] <- ifelse(((is.na(df[i,j])) & df[i,]$conversion == "1"), "Conversion", df[i,j]) 
    for (j in 1:ncol(df)){
      df[i,j] <- ifelse(((is.na(df[i,j])) & df[i,]$conversion == "0"), "Null", df[i,j])
    }   
  }
}

Unfortunately, this does not scale well. There must be a better way to do this. Any suggestions would be greatly appreciated. Thanks in advance!

3

Answers


  1. Let’s write a little function that does this to one vector (inputting both vectors):

    foo = function(x, conversion) {
        x = ifelse(!is.na(x), x, ifelse(conversion == 1, "Conversion", "Null"))
    }
    

    Then a simple loop to do it to all columns except conversion:

    for (col in setdiff(names(df), "conversion")) {
        df[[col]] = foo(df[[col]], df$conversion)
    }
    
    df
    #                 Node_1           Node_2          Node_3     Node_4     Node_5 conversion
    # 1         True Organic     True Organic            Null       Null       Null          0
    # 2         True Organic     True Organic            Null       Null       Null          0
    # 3 Organic Search (SEO)  Induced Organic            Null       Null       Null          0
    # 4              Display             Null            Null       Null       Null          0
    # 5     Social Ads (Act)  Induced Organic Induced Organic Conversion Conversion          1
    # 6             Referral Social Ads (Act)            Null       Null       Null          0
    # 7       Special Emails  Induced Organic      Conversion Conversion Conversion          1
    # 8          Daily Email      Daily Email     Daily Email       Null       Null          0
    

    Here’s a slightly optimized version. This might save you a few more seconds on 1M+ rows.

    foo_x = function(x, conversion) {
        x_na = is.na(x)
        conversion_1 = conversion == 1
        x[x_na & conversion_1] = "Conversion"
        x[x_na & !conversion_1] = "Null"
        return(x)
    }
    
    Login or Signup to reply.
  2. Another alternative using dplyr:

    library(dplyr)
    df <- df %>% 
      mutate_all(funs(case_when(
        is.na(.) & conversion == 0 ~ "Null", 
        is.na(.) & conversion == 1 ~ "conversion", 
        TRUE ~ .
    )))
    
    Login or Signup to reply.
  3. Do it with one indexed assignment, relying on the row index of each NA value to subset the df$conversion column:

    df[is.na(df)] <- c("Null","Conversion")[as.numeric(df$conversion)+1][row(df)[is.na(df)]]
    
    #                Node_1           Node_2          Node_3     Node_4     Node_5 conversion
    #1         True Organic     True Organic            Null       Null       Null          0
    #2         True Organic     True Organic            Null       Null       Null          0
    #3 Organic Search (SEO)  Induced Organic            Null       Null       Null          0
    #4              Display             Null            Null       Null       Null          0
    #5     Social Ads (Act)  Induced Organic Induced Organic Conversion Conversion          1
    #6             Referral Social Ads (Act)            Null       Null       Null          0
    #7       Special Emails  Induced Organic      Conversion Conversion Conversion          1
    #8          Daily Email      Daily Email     Daily Email       Null       Null          0
    

    It should be fast to execute. Here’s 1.5M rows and 115 columns processed in 4.2 seconds.

    df <- df[sample(1:8,1.5e6,replace=TRUE),c(sample(1:5,115,replace=TRUE),6)]
    dim(df)
    #[1] 1500000     116
    system.time({
      df[is.na(df)] <- c("Null","Conversion")[as.numeric(df$conversion)+1][row(df)[is.na(df)]]
    })
    #   user  system elapsed 
    #   2.59    1.61    4.20 
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search