The Algorithms logo
算法
关于我们捐赠

标签编码

A
library(tidyverse)
# Randomly assign every row of `diamonds` to group 1 (train, ~70%) or
# group 2 (test, ~30%). The draw is random, so the proportions are
# approximate and change from run to run.
ind <- sample(2, nrow(diamonds), replace = TRUE, prob = c(0.7, 0.3))
train.set <- diamonds[ind == 1, ]
test.set <- diamonds[ind == 2, ]

# Stack the train rows on top of the test rows so that both parts can be
# transformed together in one pass.
combi <- rbind(train.set, test.set)

## Label Encoding
# Replace each category by its zero-based position in the level vectors
# below (cut: Fair = 0 ... Ideal = 4; color: D = 0 ... J = 6).
# `combi` is a tibble, so the columns are added with dplyr::mutate();
# the data.table `:=` form does not work on it.
# A value that is not listed in the level vector is encoded as NA.
cut_levels <- c("Fair", "Good", "Very Good", "Premium", "Ideal")
color_levels <- c("D", "E", "F", "G", "H", "I", "J")

combi <- mutate(
  combi,
  # as.character() makes the lookup independent of the factor's internal codes
  cut_num = match(as.character(cut), cut_levels) - 1,
  color_num = match(as.character(color), color_levels) - 1
)

# The "clarity" column is not label-encoded because it has more distinct levels.
# The more levels a label-encoded column has, the worse the model tends to perform.

# Remove the original categorical columns now that numeric codes exist.
# `combi` is a tibble, so the columns are dropped with dplyr::select().
combi <- select(combi, -c(color, cut))
                                                     
# Divide the data back into the same train and test sets as before.
# `combi` was built as rbind(train.set, test.set), so its first
# nrow(train.set) rows are the train rows and the remainder are the test
# rows. Rebuilding the index from those counts restores the original
# partition instead of drawing a new random split.
ind <- rep(1:2, times = c(nrow(train.set), nrow(test.set)))
train.set <- combi[ind == 1, ]
test.set <- combi[ind == 2, ]