Contest Link: https://datahack.analyticsvidhya.com/contest/black-friday/

Problem Statement

A retail company, "ABC Private Limited", wants to understand customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared a purchase summary of various customers for selected high-volume products from last month. The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and the total purchase_amount from last month.

Now, they want to build a model to predict the purchase amount of a customer against various products, which will help them create personalized offers for customers against different products.

Solution

The dataset can be downloaded from the contest link.

# Read the contest's training and test sets. Both CSV files start with a
# header row, which supplies the column names.
train.df <- read.csv(file = "train.csv", header = TRUE)
test.df <- read.csv(file = "test.csv", header = TRUE)
# Dimensions (rows, columns) of the training set.
dim(train.df)
[1] 550068     12
# Dimensions (rows, columns) of the test set.
dim(test.df)
[1] 233599     11
# TRUE if at least one cell of the training set is NA.
any(is.na(train.df))
[1] TRUE
# Names of the training-set columns that contain at least one NA.
names(which(sapply(train.df, anyNA)))
[1] "Product_Category_2" "Product_Category_3"
# Per-column summary of the training set; columns with missing values also
# report their NA count.
summary(train.df)
    User_ID            Product_ID     Gender        Age           Occupation     City_Category
 Min.   :1000001   P00265242:  1880   F:135809   0-17 : 15102   Min.   : 0.000   A:147720     
 1st Qu.:1001516   P00025442:  1615   M:414259   18-25: 99660   1st Qu.: 2.000   B:231173     
 Median :1003077   P00110742:  1612              26-35:219587   Median : 7.000   C:171175     
 Mean   :1003029   P00112142:  1562              36-45:110013   Mean   : 8.077                
 3rd Qu.:1004478   P00057642:  1470              46-50: 45701   3rd Qu.:14.000                
 Max.   :1006040   P00184942:  1440              51-55: 38501   Max.   :20.000                
                   (Other)  :540489              55+  : 21504                                 
 Stay_In_Current_City_Years Marital_Status   Product_Category_1 Product_Category_2
 0 : 74398                  Min.   :0.0000   Min.   : 1.000     Min.   : 2.00     
 1 :193821                  1st Qu.:0.0000   1st Qu.: 1.000     1st Qu.: 5.00     
 2 :101838                  Median :0.0000   Median : 5.000     Median : 9.00     
 3 : 95285                  Mean   :0.4097   Mean   : 5.404     Mean   : 9.84     
 4+: 84726                  3rd Qu.:1.0000   3rd Qu.: 8.000     3rd Qu.:15.00     
                            Max.   :1.0000   Max.   :20.000     Max.   :18.00     
                                                                NA's   :173638    
 Product_Category_3    Purchase    
 Min.   : 3.0       Min.   :   12  
 1st Qu.: 9.0       1st Qu.: 5823  
 Median :14.0       Median : 8047  
 Mean   :12.7       Mean   : 9264  
 3rd Qu.:16.0       3rd Qu.:12054  
 Max.   :18.0       Max.   :23961  
 NA's   :383247                    
# Per-column summary of the test set; columns with missing values also
# report their NA count.
summary(test.df)
    User_ID            Product_ID     Gender        Age          Occupation     City_Category
 Min.   :1000001   P00265242:   829   F: 57827   0-17 : 6232   Min.   : 0.000   A:62524      
 1st Qu.:1001527   P00112142:   717   M:175772   18-25:42293   1st Qu.: 2.000   B:98566      
 Median :1003070   P00025442:   695              26-35:93428   Median : 7.000   C:72509      
 Mean   :1003029   P00110742:   680              36-45:46711   Mean   : 8.085                
 3rd Qu.:1004477   P00046742:   646              46-50:19577   3rd Qu.:14.000                
 Max.   :1006040   P00184942:   626              51-55:16283   Max.   :20.000                
                   (Other)  :229406              55+  : 9075                                 
 Stay_In_Current_City_Years Marital_Status   Product_Category_1 Product_Category_2
 0 :31318                   Min.   :0.0000   Min.   : 1.000     Min.   : 2.00     
 1 :82604                   1st Qu.:0.0000   1st Qu.: 1.000     1st Qu.: 5.00     
 2 :43589                   Median :0.0000   Median : 5.000     Median : 9.00     
 3 :40143                   Mean   :0.4101   Mean   : 5.277     Mean   : 9.85     
 4+:35945                   3rd Qu.:1.0000   3rd Qu.: 8.000     3rd Qu.:15.00     
                            Max.   :1.0000   Max.   :18.000     Max.   :18.00     
                                                                NA's   :72344     
 Product_Category_3
 Min.   : 3.00     
 1st Qu.: 9.00     
 Median :14.00     
 Mean   :12.67     
 3rd Qu.:16.00     
 Max.   :18.00     
 NA's   :162562    
# Columns 10 and 11 are Product_Category_2 and Product_Category_3, the only
# columns with NAs. Recode every missing entry as 20, which lies outside the
# 2-18 range observed for these two columns, so "missing" becomes its own
# category.
na_cols <- c("Product_Category_2", "Product_Category_3")
train.df[na_cols][is.na(train.df[na_cols])] <- 20
test.df[na_cols][is.na(test.df[na_cols])] <- 20
# Number of NA cells left in the training set (0 once imputation has run).
sum(is.na(train.df))
[1] 0
# Number of NA cells left in the test set (0 once imputation has run).
sum(is.na(test.df))
[1] 0
# ---- Combine train and test ------------------------------------------------
# Work on copies of the imputed frames (`train.df` / `test.df`); the original
# code referred to `train` / `test` without ever creating them.
# `data` marks the origin of each row (1 = train, 0 = test). Test rows get a
# placeholder Purchase of 0 so both frames have the same columns for rbind().
train <- train.df
train$data <- 1
test <- test.df
test$Purchase <- 0
test$data <- 0
total <- rbind(train, test)

# Treat the 11 predictor columns as categorical. Converting on the combined
# frame gives train and test rows one shared set of factor levels.
total[1:11] <- lapply(total[1:11], as.factor)

# ---- Split back into modelling frames --------------------------------------
# Training frame: train-origin rows with Marital_Status == 0 only.
# NOTE(review): the scoring frame below keeps every marital status, so the
# models are fitted on unmarried customers but predict for everyone. Confirm
# this is intended.
train.notmar <- total[total$Marital_Status == 0 & total$data == 1, ]
test.notmar <- total[total$data == 0, ]

# Drop the helper flag, and the placeholder target from the scoring frame.
train.notmar$data <- NULL
test.notmar$data <- NULL
test.notmar$Purchase <- NULL

# ---- Decision tree ---------------------------------------------------------
library(rpart)
model <- rpart(Purchase ~ ., data = train.notmar)
pred_tree <- predict(model, test.notmar)

# rbind() keeps row order, so predictions line up with the rows of `test`.
submit <- data.frame(
  User_ID = test$User_ID,
  Product_ID = test$Product_ID,
  Purchase = pred_tree
)

# ---- XGBoost ---------------------------------------------------------------
library(xgboost)

# xgboost needs a numeric matrix: replace each factor by its integer level
# code (Purchase, column 12 of the training frame, is already numeric).
train.notmar[1:12] <- lapply(train.notmar[1:12], as.numeric)
test.notmar[1:11] <- lapply(test.notmar[1:11], as.numeric)

# Marital_Status is left out: it is constant in the training frame.
X_features <- c(
  "User_ID", "Product_ID", "Gender",
  "Age", "Occupation", "City_Category",
  "Stay_In_Current_City_Years", "Product_Category_1",
  "Product_Category_2", "Product_Category_3"
)
X_target <- train.notmar$Purchase

# Arguments are passed with `=`. The original used `<-` inside the calls,
# which passed the values positionally and, as a side effect, created the
# globals `data` and `nrounds` and overwrote the rpart `model`.
xgtrain <- xgb.DMatrix(
  data = as.matrix(train.notmar[, X_features]),
  label = X_target,
  missing = NA
)
xgtest <- xgb.DMatrix(
  data = as.matrix(test.notmar[, X_features]),
  missing = NA
)

# NOTE(review): "reg:linear" is the legacy name of the squared-error
# objective; newer xgboost releases call it "reg:squarederror". Switch if the
# installed version warns about or rejects the old name.
params <- list(
  objective = "reg:linear",
  eta = 0.23,
  max_depth = 10,
  subsample = 1,
  colsample_bytree = 1,
  min_child_weight = 2,
  eval_metric = "rmse"
)

model_xgb <- xgb.train(params = params, data = xgtrain, nrounds = 100)
vimp <- xgb.importance(feature_names = X_features, model = model_xgb)
pred_boost <- predict(model_xgb, xgtest)

# ---- Blend and write the submission ----------------------------------------
submit$Purchase_boosted <- pred_boost

# Weighted average of the two models: tree weight 1, boosted weight 2.
Final_submit <- submit[, c("User_ID", "Product_ID", "Purchase")]
Final_submit$Purchase_1 <- (submit$Purchase + 2 * submit$Purchase_boosted) / 3

# The original wrote Final_submit[, -c(3, 4)], which removed both prediction
# columns and left only the two ID columns in result.csv. Write the blended
# prediction instead.
# NOTE(review): the output column is named `Purchase` on the assumption that
# the contest expects that header; confirm against the sample submission.
result <- data.frame(
  User_ID = Final_submit$User_ID,
  Product_ID = Final_submit$Product_ID,
  Purchase = Final_submit$Purchase_1
)
write.csv(result, "result.csv", row.names = FALSE)
LS0tCnRpdGxlOiAiQmxhY2stRnJpZGF5IFNhbGVzIFByZWRpY3Rpb24iCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCj4gQ29udGVzdCBMaW5rOiBodHRwczovL2RhdGFoYWNrLmFuYWx5dGljc3ZpZGh5YS5jb20vY29udGVzdC9ibGFjay1mcmlkYXkvCgojIyMgUHJvYmxlbSBTdGF0ZW1lbnQKCkEgcmV0YWlsIGNvbXBhbnkgPz8/QUJDIFByaXZhdGUgTGltaXRlZD8/PyB3YW50cyB0byB1bmRlcnN0YW5kIHRoZSBjdXN0b21lciBwdXJjaGFzZSBiZWhhdmlvdXIgKHNwZWNpZmljYWxseSwgcHVyY2hhc2UgYW1vdW50KSBhZ2FpbnN0IHZhcmlvdXMgcHJvZHVjdHMgb2YgZGlmZmVyZW50IGNhdGVnb3JpZXMuIFRoZXkgaGF2ZSBzaGFyZWQgcHVyY2hhc2Ugc3VtbWFyeSBvZiB2YXJpb3VzIGN1c3RvbWVycyBmb3Igc2VsZWN0ZWQgaGlnaCB2b2x1bWUgcHJvZHVjdHMgZnJvbSBsYXN0IG1vbnRoLgpUaGUgZGF0YSBzZXQgYWxzbyBjb250YWlucyBjdXN0b21lciBkZW1vZ3JhcGhpY3MgKGFnZSwgZ2VuZGVyLCBtYXJpdGFsIHN0YXR1cywgY2l0eV90eXBlLCBzdGF5X2luX2N1cnJlbnRfY2l0eSksIHByb2R1Y3QgZGV0YWlscyAocHJvZHVjdF9pZCBhbmQgcHJvZHVjdCBjYXRlZ29yeSkgYW5kIFRvdGFsIHB1cmNoYXNlX2Ftb3VudCBmcm9tIGxhc3QgbW9udGguCgpOb3csIHRoZXkgd2FudCB0byBidWlsZCBhIG1vZGVsIHRvIHByZWRpY3QgdGhlIHB1cmNoYXNlIGFtb3VudCBvZiBjdXN0b21lciBhZ2FpbnN0IHZhcmlvdXMgcHJvZHVjdHMgd2hpY2ggd2lsbCBoZWxwIHRoZW0gdG8gY3JlYXRlIHBlcnNvbmFsaXplZCBvZmZlciBmb3IgY3VzdG9tZXJzIGFnYWluc3QgZGlmZmVyZW50IHByb2R1Y3RzLgoKIyMjIFNvbHV0aW9uCgpUaGUgZGF0YXNldCBjYW4gYmUgZG93bmxvYWRlZCBmcm9tIFtjb250ZXN0IGxpbmtdKGh0dHBzOi8vZGF0YWhhY2suYW5hbHl0aWNzdmlkaHlhLmNvbS9jb250ZXN0L2JsYWNrLWZyaWRheS8pLiAKCi0gTGV0cyBsb2FkIHRoZSB0cmFpbiBhbmQgdGVzdCBkYXRhc2V0IGludG8gdGhlIGRhdGFmcmFtZS4gQXMgd2Ugc2VlLCBoZWFkZXIgaXMgcHJlc2VudCBpbiB0aGUgZGF0YXNldCwgbGV0cyBwYXNzIHdpdGggdGhlIGFyZ3VtZW50LCBgaGVhZGVyPVRSVUVgIHRvIHNraXAgdGhlIGZpcnN0IHJvdy4KCmBgYHtyfQp0cmFpbi5kZiA8LSByZWFkLmNzdigidHJhaW4uY3N2IiwgaGVhZGVyID0gVFJVRSkKdGVzdC5kZiA8LSByZWFkLmNzdigidGVzdC5jc3YiLCBoZWFkZXIgPSBUUlVFKSAKYGBgCgotIFN0YXJ0aW5nIHdpdGggcHJpbGltaW5hcnkgYW5hbHlzaXMsIHdlIGZpcnN0IGNoZWNrIHRoZSBkaW1lbnNpb24gb2YgdGhlIGRhdGEKCmBgYHtyfQpkaW0odHJhaW4uZGYpCmRpbSh0ZXN0LmRmKQpgYGAKCi0gU2Vjb25kbHksIHdlIHJ1biB0aGUgYmVsb3cgdG8gY2hlY2sgZm9yIG1pc3NpbmcgdmFsdWVzLiBNaXNzaW5nIHZhbHVlcyBjYW4gY2F1c2UgcHJvYmxlbXMgaW4gdGhlIHRyYWluaW5nIHBoYXNlIGlmIG5vdCB0
YWtlbiBjYXJlIG9mIGJlZm9yZS4KCmBgYHtyfQphbnkoaXMubmEodHJhaW4uZGYpKQpgYGAKCi0gQ2hlY2tpbmcgd2hpY2ggY29sdW1ucywgdGhlIG1pc3NpbmcgdmFsdWVzIGJlbG9uZyB0bwoKYGBge3J9Cm5hbWVzKHdoaWNoKHNhcHBseSh0cmFpbi5kZiwgYW55TkEpKSkKYGBgCgotIExldHMgc2VlIHRoZSBzdW1tYXJ5IG9mIGVhY2ggZGF0YWZyYW1lcyB0byBnZXQgdGhlIGNvdW50IG9mIHRoZSBtaXNzaW5nIHZhbHVlcyBpbiB0aGUgdHJhaW5pbmcgYW5kIHRlc3Qgc2V0LiBTdW1tYXJ5IHByb3ZpZGVzIGFuIG92ZXJhbGwgaW5zaWdodCBpbnRvIHRoZSBkYXRhLgoKYGBge3J9CnN1bW1hcnkodHJhaW4uZGYpCnN1bW1hcnkodGVzdC5kZikKYGBgCgotIEltcHV0aW5nIHRoZSBtaXNzaW5nIHZhbHVlIHdpdGggMjAgaW4gdGhlIHRyYWluaW5nIGRhdGEKCmBgYHtyfQp0cmFpbi5kZlsxMF1baXMubmEodHJhaW4uZGZbMTBdKV0gPC0gMjAKdHJhaW4uZGZbMTFdW2lzLm5hKHRyYWluLmRmWzExXSldIDwtIDIwCmBgYAoKLSBJbXB1dGluZyB0aGUgbWlzc2luZyB2YWx1ZSB3aXRoIDIwLCBpbiB0aGUgdGVzdGluZyBkYXRhCgpgYGB7cn0KdGVzdC5kZlsxMF1baXMubmEodGVzdC5kZlsxMF0pXSA8LSAyMAp0ZXN0LmRmWzExXVtpcy5uYSh0ZXN0LmRmWzExXSldIDwtIDIwCmBgYAoKLSBDaGVja2luZyB0aGUgdHJhaW5pbmcgYW5kIHRlc3RpbmcgZGF0YSB0byBtYWtlIHN1cmUgdGhlcmUgYXJlIG5vIG1vcmUgbWlzc2luZyB2YWx1ZXMKCmBgYHtyfQpzdW0oaXMubmEodHJhaW4uZGYpKQpzdW0oaXMubmEodGVzdC5kZikpCmBgYAoKCmBgYHtyfQp0cmFpbiRkYXRhIDwtIDEKdGVzdCRQdXJjaGFzZSA8LSAwCnRlc3QkZGF0YSA8LSAwCmBgYAoKYGBge3J9CnRvdGFsIDwtIHJiaW5kKHRyYWluLHRlc3QpCmBgYAoKYGBge3J9CmZvciAoaSBpbiAxOjExKQp7CiAgdG90YWxbLGldIDwtIGFzLmZhY3Rvcih0b3RhbFssaV0pCn0KYGBgCgpgYGB7cn0KdHJhaW4ubm90bWFyIDwtIHRvdGFsW3RvdGFsJE1hcml0YWxfU3RhdHVzID09IDAgLF0gCnRyYWluLm5vdG1hciA8LSB0cmFpbi5ub3RtYXJbdHJhaW4ubm90bWFyJGRhdGEgPT0gMSxdCnRlc3Qubm90bWFyIDwtIHRvdGFsW3RvdGFsJGRhdGEgPT0gMCAsXQpgYGAKCmBgYHtyfQp0ZXN0Lm5vdG1hciRQdXJjaGFzZSA8LSBOVUxMCnRlc3Qubm90bWFyJGRhdGEgPC0gTlVMTAp0cmFpbi5ub3RtYXIkZGF0YSA8LSBOVUxMCmBgYAoKLSBEZWNpc2lvbiBUcmVlCgpgYGB7cn0KbGlicmFyeShycGFydCkKYGBgCgpgYGB7cn0KbW9kZWwgPC0gcnBhcnQoUHVyY2hhc2UgfiAuLGRhdGEgPSB0cmFpbi5ub3RtYXIpCnByZWRfdHJlZSA8LSBwcmVkaWN0KG1vZGVsLCB0ZXN0Lm5vdG1hcikKYGBgCgpgYGB7cn0Kc3VibWl0IDwtIGRhdGEuZnJhbWUoVXNlcl9JRCA9IHRlc3QkVXNlcl9JRCwKICAgICAgICAgICAgICAgICAgICAgUHJvZHVjdF9JRCA9IHRlc3QkUHJvZHVjdF9JRCwKICAgICAgICAgICAgICAgICAgICAg
UHVyY2hhc2UgPSBwcmVkX3RyZWUpCmBgYAoKLSBYR2Jvb3N0CgpgYGB7cn0KbGlicmFyeSh4Z2Jvb3N0KQpgYGAKCmBgYHtyfQpmb3IgKGkgaW4gMToxMikKewogIHRyYWluLm5vdG1hclssaV0gPC0gIGFzLm51bWVyaWModHJhaW4ubm90bWFyWyxpXSkKfQpgYGAKCmBgYHtyfQpmb3IgKGkgaW4gMToxMSkKewogIHRlc3Qubm90bWFyWyxpXSA8LSAgYXMubnVtZXJpYyh0ZXN0Lm5vdG1hclssaV0pCn0KCmBgYAoKLSBGZWF0dXJlcwoKYGBge3J9ClhfZmVhdHVyZXMgPC0gYyggIlVzZXJfSUQiICwgIlByb2R1Y3RfSUQiICwgIkdlbmRlciIgLCAgICAgICAgICAgICAgICAgICAKICAgICAgICAgICAgICAgICAiQWdlIiAsICJPY2N1cGF0aW9uIiAsICAiQ2l0eV9DYXRlZ29yeSIgLCAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgIlN0YXlfSW5fQ3VycmVudF9DaXR5X1llYXJzIiAsICJQcm9kdWN0X0NhdGVnb3J5XzEiICwgICAgICAgCiAgICAgICAgICAgICAgICAgIlByb2R1Y3RfQ2F0ZWdvcnlfMiIgLCAiUHJvZHVjdF9DYXRlZ29yeV8zIikKYGBgCgpgYGB7cn0KWF90YXJnZXQgPC0gdHJhaW4ubm90bWFyJFB1cmNoYXNlCmBgYAoKYGBge3J9CnhndHJhaW4gPC0geGdiLkRNYXRyaXgoZGF0YSA8LSBhcy5tYXRyaXgodHJhaW4ubm90bWFyWywgWF9mZWF0dXJlc10pLCBsYWJlbCA9IFhfdGFyZ2V0LCBtaXNzaW5nID0gTkEpCnhndGVzdCA8LSB4Z2IuRE1hdHJpeChkYXRhIDwtIGFzLm1hdHJpeCh0ZXN0Lm5vdG1hclssIFhfZmVhdHVyZXNdKSwgbWlzc2luZyA9IE5BKQpgYGAKCi0gU2V0dGluZyBQYXJhbWV0ZXJzCgpgYGB7cn0KcGFyYW1zIDwtIGxpc3QoKQpwYXJhbXMkb2JqZWN0aXZlIDwtICJyZWc6bGluZWFyIgpwYXJhbXMkZXRhIDwtIDAuMjMKcGFyYW1zJG1heF9kZXB0aCA8LSAxMApwYXJhbXMkc3Vic2FtcGxlIDwtIDEKcGFyYW1zJGNvbHNhbXBsZV9ieXRyZWUgPC0gMQpwYXJhbXMkbWluX2NoaWxkX3dlaWdodCA8LSAyCnBhcmFtcyRldmFsX21ldHJpYyA8LSAicm1zZSIKYGBgCgotIGJ1aWxkaW5nIG1vZGVsCgpgYGB7cn0KbW9kZWxfeGdiIDwtIHhnYi50cmFpbihwYXJhbXMgPC0gcGFyYW1zLCB4Z3RyYWluLCBucm91bmRzIDwtIDEwMCkKYGBgCgotIGNoZWNraW5nIGltcG9ydGFudCBGZWF0dXJlcwoKYGBge3J9CnZpbXAgPC0geGdiLmltcG9ydGFuY2UobW9kZWwgPC0gbW9kZWxfeGdiLCBmZWF0dXJlX25hbWVzID0gWF9mZWF0dXJlcykKYGBgCgotIFByZWRpY3RpbmcKCmBgYHtyfQpwcmVkX2Jvb3N0IDwtIHByZWRpY3QobW9kZWxfeGdiLCB4Z3Rlc3QpCmBgYAoKLSBTdWJtaXNzaW9uCgpgYGB7cn0Kc3VibWl0JFB1cmNoYXNlX2Jvb3N0ZWQgPC0gcHJlZF9ib29zdCAKRmluYWxfc3VibWl0IDwtIHN1Ym1pdApgYGAKCi0gV2VpZ2h0ZWQgQXZlcmFnZSBvZiBEZWNpc2lvbiB0cmVlIGFuZCBCb29zdGluZwoKYGBge3J9CkZpbmFsX3N1Ym1pdDwtRmluYWxfc3VibWl0WywtYyg0KV0KRmluYWxfc3VibWl0JFB1cmNoYXNlXzEg
PC0gKHN1Ym1pdCRQdXJjaGFzZSArIDIqc3VibWl0JFB1cmNoYXNlX2Jvb3N0ZWQpLzMKd3JpdGUuY3N2KEZpbmFsX3N1Ym1pdFssLWMoMyw0KV0sICJyZXN1bHQuY3N2Iiwgcm93Lm5hbWVzID0gRkFMU0UpCmBgYA==