Contest Link: https://datahack.analyticsvidhya.com/contest/black-friday/
A retail company "ABC Private Limited" wants to understand customer purchase behaviour (specifically, the purchase amount) against various products of different categories. They have shared a purchase summary of various customers for selected high-volume products from last month. The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and the total purchase_amount from last month.
Now, they want to build a model to predict the purchase amount of customers against various products, which will help them create personalized offers for customers on different products.
The dataset can be downloaded from the contest link.
We read the train and test files with header = TRUE so that the first row is treated as column names rather than data.
train.df <- read.csv("train.csv", header = TRUE)
test.df <- read.csv("test.csv", header = TRUE)
dim(train.df)
[1] 550068 12
dim(test.df)
[1] 233599 11
any(is.na(train.df))
[1] TRUE
names(which(sapply(train.df, anyNA)))
[1] "Product_Category_2" "Product_Category_3"
summary(train.df)
User_ID Product_ID Gender Age Occupation City_Category
Min. :1000001 P00265242: 1880 F:135809 0-17 : 15102 Min. : 0.000 A:147720
1st Qu.:1001516 P00025442: 1615 M:414259 18-25: 99660 1st Qu.: 2.000 B:231173
Median :1003077 P00110742: 1612 26-35:219587 Median : 7.000 C:171175
Mean :1003029 P00112142: 1562 36-45:110013 Mean : 8.077
3rd Qu.:1004478 P00057642: 1470 46-50: 45701 3rd Qu.:14.000
Max. :1006040 P00184942: 1440 51-55: 38501 Max. :20.000
(Other) :540489 55+ : 21504
Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2
0 : 74398 Min. :0.0000 Min. : 1.000 Min. : 2.00
1 :193821 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.: 5.00
2 :101838 Median :0.0000 Median : 5.000 Median : 9.00
3 : 95285 Mean :0.4097 Mean : 5.404 Mean : 9.84
4+: 84726 3rd Qu.:1.0000 3rd Qu.: 8.000 3rd Qu.:15.00
Max. :1.0000 Max. :20.000 Max. :18.00
NA's :173638
Product_Category_3 Purchase
Min. : 3.0 Min. : 12
1st Qu.: 9.0 1st Qu.: 5823
Median :14.0 Median : 8047
Mean :12.7 Mean : 9264
3rd Qu.:16.0 3rd Qu.:12054
Max. :18.0 Max. :23961
NA's :383247
summary(test.df)
User_ID Product_ID Gender Age Occupation City_Category
Min. :1000001 P00265242: 829 F: 57827 0-17 : 6232 Min. : 0.000 A:62524
1st Qu.:1001527 P00112142: 717 M:175772 18-25:42293 1st Qu.: 2.000 B:98566
Median :1003070 P00025442: 695 26-35:93428 Median : 7.000 C:72509
Mean :1003029 P00110742: 680 36-45:46711 Mean : 8.085
3rd Qu.:1004477 P00046742: 646 46-50:19577 3rd Qu.:14.000
Max. :1006040 P00184942: 626 51-55:16283 Max. :20.000
(Other) :229406 55+ : 9075
Stay_In_Current_City_Years Marital_Status Product_Category_1 Product_Category_2
0 :31318 Min. :0.0000 Min. : 1.000 Min. : 2.00
1 :82604 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.: 5.00
2 :43589 Median :0.0000 Median : 5.000 Median : 9.00
3 :40143 Mean :0.4101 Mean : 5.277 Mean : 9.85
4+:35945 3rd Qu.:1.0000 3rd Qu.: 8.000 3rd Qu.:15.00
Max. :1.0000 Max. :18.000 Max. :18.00
NA's :72344
Product_Category_3
Min. : 3.00
1st Qu.: 9.00
Median :14.00
Mean :12.67
3rd Qu.:16.00
Max. :18.00
NA's :162562
The product category codes are small integers (1 to 20), so we replace the NAs in Product_Category_2 and Product_Category_3 (columns 10 and 11) with 20, effectively treating "missing" as a category of its own.
train.df[10][is.na(train.df[10])] <- 20   # Product_Category_2
train.df[11][is.na(train.df[11])] <- 20   # Product_Category_3
test.df[10][is.na(test.df[10])] <- 20
test.df[11][is.na(test.df[11])] <- 20
sum(is.na(train.df))
[1] 0
sum(is.na(test.df))
[1] 0
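The same imputation can also be written by column name rather than position, which reads a little clearer and is safer if the column order ever changes (an equivalent sketch, not a different method):
train.df$Product_Category_2[is.na(train.df$Product_Category_2)] <- 20
train.df$Product_Category_3[is.na(train.df$Product_Category_3)] <- 20
# and likewise for test.df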
We flag the two sets with a helper column, give the test set a dummy Purchase so both frames share the same columns, and stack them.
train.df$data <- 1
test.df$Purchase <- 0
test.df$data <- 0
total <- rbind(train.df, test.df)
# convert the 11 predictor columns (everything except Purchase and the data flag) to factors
for (i in 1:11)
{
  total[, i] <- as.factor(total[, i])
}
# keep only unmarried customers (Marital_Status == 0) for training
train.notmar <- total[total$Marital_Status == 0, ]
train.notmar <- train.notmar[train.notmar$data == 1, ]
# the test split is every row flagged data == 0
test.notmar <- total[total$data == 0, ]
test.notmar$Purchase <- NULL
test.notmar$data <- NULL
train.notmar$data <- NULL
library(rpart)
# fit a regression tree on the (unmarried-only) training rows
model <- rpart(Purchase ~ ., data = train.notmar)
# predict on every test row
pred_tree <- predict(model, test.notmar)
submit <- data.frame(User_ID = test.df$User_ID,
                     Product_ID = test.df$Product_ID,
                     Purchase = pred_tree)
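To see what the tree actually learned before moving on, one can inspect its complexity table and variable importance (a quick sketch using standard rpart accessors):
printcp(model)               # cross-validated error at each pruning level
model$variable.importance    # predictors the splits rely on most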
library(xgboost)
# xgboost works on numeric matrices, so convert the factor columns to their integer level codes
for (i in 1:12)
{
  train.notmar[, i] <- as.numeric(train.notmar[, i])
}
for (i in 1:11)
{
  test.notmar[, i] <- as.numeric(test.notmar[, i])
}
X_features <- c( "User_ID" , "Product_ID" , "Gender" ,
"Age" , "Occupation" , "City_Category" ,
"Stay_In_Current_City_Years" , "Product_Category_1" ,
"Product_Category_2" , "Product_Category_3")
X_target <- train.notmar$Purchase
xgtrain <- xgb.DMatrix(data = as.matrix(train.notmar[, X_features]), label = X_target, missing = NA)
xgtest <- xgb.DMatrix(data = as.matrix(test.notmar[, X_features]), missing = NA)
params <- list()
params$objective <- "reg:linear"
params$eta <- 0.23
params$max_depth <- 10
params$subsample <- 1
params$colsample_bytree <- 1
params$min_child_weight <- 2
params$eval_metric <- "rmse"
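The number of boosting rounds below (100) is fixed by hand; a minimal sketch of how one might cross-validate it instead, assuming 5 folds and early stopping (not part of the original pipeline):
cv <- xgb.cv(params = params, data = xgtrain, nrounds = 200,
             nfold = 5, early_stopping_rounds = 10, verbose = 0)
cv$best_iteration   # suggested round count under these assumptions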
model_xgb <- xgb.train(params = params, data = xgtrain, nrounds = 100)
vimp <- xgb.importance(feature_names = X_features, model = model_xgb)
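The importance table can be visualised directly; a usage example with the default bar plot:
xgb.plot.importance(importance_matrix = vimp)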
pred_boost <- predict(model_xgb, xgtest)
submit$Purchase_boosted <- pred_boost
Final_submit <- submit
# blend the two models: one part tree prediction, two parts boosted prediction
Final_submit$Purchase <- (submit$Purchase + 2 * submit$Purchase_boosted) / 3
write.csv(Final_submit[, c("User_ID", "Product_ID", "Purchase")], "result.csv", row.names = FALSE)
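As a final sanity check (optional, not required by the contest), the written file can be read back to confirm it has the expected 233599 rows and the three submission columns:
sub_check <- read.csv("result.csv")
dim(sub_check)
head(sub_check)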