This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Battery
# The unique values of Battery include a few negative readings. A battery
# level cannot be negative, so those rows are incorrect and are removed below.
sum(is.na(data["Battery"])) # 677 NA values in Battery; these rows are dropped.
## [1] 677
# Note: na.omit() drops every row that has an NA in any column, not only Battery.
data <- na.omit(data)
min(data$Battery)
## [1] -9
max(data$Battery)
## [1] 98
# Keep strictly positive readings only (this also drops rows where Battery is 0).
data <- data[data$Battery > 0, ]
# Build the plotting vector after filtering so the histogram shows the cleaned
# values rather than the invalid readings that were just removed.
battery <- as.numeric(data$Battery)
hist(battery, col = "purple")
# Confirm that no missing Battery values remain.
sum(is.na(data$Battery))
## [1] 0
head(data)
## HarvestTime BikeID Battery BikeIdentifier BikeTypeName EBikeProfileID
## 1 2020-09-23 17:00:02 5 90 1 DUB-General 1
## 2 2020-09-23 17:00:02 6 46 2 DUB-General 1
## 4 2020-09-23 17:00:02 9 68 5 DUB-General 1
## 5 2020-09-23 17:00:02 12 61 8 DUB-General 1
## 6 2020-09-23 17:00:02 14 77 11 DUB-General 1
## 7 2020-09-23 17:00:02 15 41 13 DUB-General 1
## EBikeStateID LastGPSTime LastRentalStart Latitude Longitude
## 1 2 2020-09-23 17:46:16 2020-09-23 15:34:58 53.3334 -6.24431
## 2 2 2020-09-23 17:44:16 2020-09-23 17:35:42 53.3414 -6.28357
## 4 2 2020-09-23 17:51:29 2020-09-22 07:48:40 53.3252 -6.25521
## 5 2 2020-09-23 17:56:10 2020-09-23 17:43:14 53.3090 -6.21631
## 6 2 2020-09-23 17:55:13 2020-09-23 15:47:16 53.3382 -6.22257
## 7 2 2020-09-23 17:40:16 2020-09-22 06:01:22 53.2981 -6.16029
# BikeTypeName: number of records for each bike type
sum(data$BikeTypeName == "DUB-General")
## [1] 25397
sum(data$BikeTypeName == "Private")
## [1] 350
sum(data$BikeTypeName == "Workshop")
## [1] 629
# Relative frequency of each bike type, drawn as a bar chart
BikeType <- data.frame(data$BikeTypeName)
barplot(
  height = prop.table(table(BikeType)),
  col = "light Blue"
)
# EBikeProfileID: number of records for each profile ID
sum(data$EBikeProfileID == 4)
## [1] 415
sum(data$EBikeProfileID == 1)
## [1] 25961
# Relative frequency of each profile ID, drawn as a bar chart
Bikeprofile <- data.frame(data$EBikeProfileID)
barplot(
  height = prop.table(table(Bikeprofile)),
  col = "light green"
)
# EBikeStateID: number of records for each state ID
sum(data[, "EBikeStateID"] == 2)
## [1] 26157
sum(data[, "EBikeStateID"] == 5)
## [1] 154
sum(data[, "EBikeStateID"] == 1)
## [1] 65
BikeState <- data.frame(data$EBikeStateID)
# Indexing the one-column data frame with a logical mask returns the matching
# state IDs (1 and 5 only) as a plain vector.
bikestat_15 <- BikeState[BikeState == 1 | BikeState == 5]
# Relative frequency of all states, then of states 1 and 5 on their own.
barplot(prop.table(table(BikeState)), col = "pink")
barplot(prop.table(table(bikestat_15)), col = "dark red")
# Indicates: {1:'Warning - is in move and not rented',2:'Normal',3:'Switched Off',4:'Firmware Upgrade',5:'Laying on the ground'}
# Since this is a categorical variable, represent it as a factor. The factor is
# built from the column vector: calling as.factor() on the one-column data frame
# would not give one factor value per row.
BikeState <- as.factor(data$EBikeStateID)
# Latitude, Longitude
min(data["Latitude"])
## [1] 0
max(data["Longitude"])
## [1] 0
# The minimum latitude and the maximum longitude are both 0, which is not a
# valid location. Filling such values with a mean or median makes no sense for
# a location, so rows with a zero latitude are removed instead.
data <- data[data$Latitude != 0, ]
min(data$Latitude)
## [1] 53.2798
dim(data)
## [1] 26032 11
# 26032 x 11: the number of rows has been reduced
# Order the records by bike and, within each bike, by harvest time
data_sorted <- data[with(data, order(BikeID, HarvestTime)), ]
summary(object = data_sorted)
## HarvestTime BikeID Battery BikeIdentifier
## Length:26032 Min. : 5.00 Min. : 1.00 Min. : 1.00
## Class :character 1st Qu.: 33.00 1st Qu.:45.00 1st Qu.: 29.00
## Mode :character Median : 59.00 Median :68.00 Median : 55.00
## Mean : 59.87 Mean :62.41 Mean : 56.33
## 3rd Qu.: 86.00 3rd Qu.:81.00 3rd Qu.: 83.00
## Max. :124.00 Max. :98.00 Max. :120.00
## BikeTypeName EBikeProfileID EBikeStateID LastGPSTime
## Length:26032 Min. :1.000 Min. :1.000 Length:26032
## Class :character 1st Qu.:1.000 1st Qu.:2.000 Class :character
## Mode :character Median :1.000 Median :2.000 Mode :character
## Mean :1.047 Mean :2.016
## 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :4.000 Max. :5.000
## LastRentalStart Latitude Longitude
## Length:26032 Min. :53.28 Min. :-6.363
## Class :character 1st Qu.:53.32 1st Qu.:-6.275
## Mode :character Median :53.34 Median :-6.258
## Mean :53.34 Mean :-6.249
## 3rd Qu.:53.35 3rd Qu.:-6.232
## Max. :53.39 Max. :-6.114
# Export the sorted records as CSV, without a row-name column.
# NOTE(review): the destination is a hard-coded absolute path that uses
# backslash separators and has a space before "Data_Cleaning" - confirm it
# resolves as intended on the machine running this script.
fwrite(
  x = data_sorted,
  file = "\\Users\\matenezamaninia\\Documents\\ANLY_501\\Project\\ Data_Cleaning\\Record_Cleaning_in_R\\sorted_in_R.csv",
  row.names = FALSE
)