R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

Let's compute some summary statistics and do some initial exploration.

# Battery ----
# Battery readings cannot be negative, so rows with missing or negative
# readings are treated as invalid and removed before further analysis.

# Count the missing Battery readings (677 in this data set).
sum(is.na(data["Battery"]))
## [1] 677
# NOTE: na.omit() drops every row that has an NA in *any* column, not only
# the rows where Battery is missing.
data <- na.omit(data)

# Inspect the observed range; the minimum reveals the negative readings.
min(data$Battery)
## [1] -9
max(data$Battery)
## [1] 98

# Keep strictly positive readings only (this also discards readings of 0).
data <- data[data$Battery > 0, ]

# Extract the vector *after* filtering so the histogram shows the cleaned
# values instead of the pre-filter values that still contain negatives.
battery <- as.numeric(data$Battery)
hist(battery, col = "purple")

# Confirm that no missing Battery values remain and preview the cleaned data.
sum(is.na(data$Battery))
## [1] 0
head(data)
##           HarvestTime BikeID Battery BikeIdentifier BikeTypeName EBikeProfileID
## 1 2020-09-23 17:00:02      5      90              1  DUB-General              1
## 2 2020-09-23 17:00:02      6      46              2  DUB-General              1
## 4 2020-09-23 17:00:02      9      68              5  DUB-General              1
## 5 2020-09-23 17:00:02     12      61              8  DUB-General              1
## 6 2020-09-23 17:00:02     14      77             11  DUB-General              1
## 7 2020-09-23 17:00:02     15      41             13  DUB-General              1
##   EBikeStateID         LastGPSTime     LastRentalStart Latitude Longitude
## 1            2 2020-09-23 17:46:16 2020-09-23 15:34:58  53.3334  -6.24431
## 2            2 2020-09-23 17:44:16 2020-09-23 17:35:42  53.3414  -6.28357
## 4            2 2020-09-23 17:51:29 2020-09-22 07:48:40  53.3252  -6.25521
## 5            2 2020-09-23 17:56:10 2020-09-23 17:43:14  53.3090  -6.21631
## 6            2 2020-09-23 17:55:13 2020-09-23 15:47:16  53.3382  -6.22257
## 7            2 2020-09-23 17:40:16 2020-09-22 06:01:22  53.2981  -6.16029
# BikeTypeName ----
# Tally the rows that belong to each bike type.
sum(data[["BikeTypeName"]] == "DUB-General")
## [1] 25397
sum(data[["BikeTypeName"]] == "Private")
## [1] 350
sum(data[["BikeTypeName"]] == "Workshop")
## [1] 629

# Plot each bike type's share of all rows as a bar chart.
BikeType <- data.frame(data$BikeTypeName)
bike_type_share <- prop.table(table(BikeType))
barplot(bike_type_share, col = "light Blue")

# EBikeProfileID ----
# Tally the rows recorded under profile 4 and profile 1.
sum(data[["EBikeProfileID"]] == 4)
## [1] 415
sum(data[["EBikeProfileID"]] == 1)
## [1] 25961

# Plot each profile's share of all rows as a bar chart.
Bikeprofile <- data.frame(data$EBikeProfileID)
bike_profile_share <- prop.table(table(Bikeprofile))
barplot(bike_profile_share, col = "light green")

# EBikeStateID

# Count the rows recorded in each observed bike state.
# State codes:
#   1: 'Warning - is in move and not rented'
#   2: 'Normal'
#   3: 'Switched Off'
#   4: 'Firmware Upgrade'
#   5: 'Laying on the ground'
sum(data[, "EBikeStateID"] == 2)
## [1] 26157
sum(data[, "EBikeStateID"] == 5)
## [1] 154
sum(data[, "EBikeStateID"] == 1)
## [1] 65

# One-column data frame of state codes, used for the plots below.
BikeState <- data.frame(data$EBikeStateID)

# States 1 and 5 are rare next to state 2, so they are also plotted on their
# own to make their relative proportions visible.
bikestat_15 <- BikeState[BikeState == 1 | BikeState == 5]
barplot(prop.table(table(BikeState)), col = "pink")

barplot(prop.table(table(bikestat_15)), col = "dark red")

# The state code is categorical, so store it as a factor. The conversion is
# applied to the column vector: calling as.factor() on the data frame itself
# does not produce one factor value per row.
BikeState <- as.factor(BikeState[[1]])
# Latitude, Longitude ----
# Check the extremes of the coordinates; a value of 0 in both indicates an
# invalid location.
min(data["Latitude"])
## [1] 0
max(data["Longitude"])
## [1] 0

# There is one row that has 0 for both values, which is incorrect. Because the
# values describe a location, imputing a mean or median would not make sense,
# so the row is removed instead.
# (The former `data["Latitude" > 0]` step was dropped: it compared the string
# "Latitude" with 0, which yields a single logical value and filters nothing.)
data <- data[data$Latitude != 0, ]
min(data$Latitude)
## [1] 53.2798
dim(data)
## [1] 26032    11
# 26032 x 11: the number of rows has been reduced by the cleaning steps.
# Order the cleaned rows by bike ID, then by harvest time within each bike.
row_order <- order(data$BikeID, data$HarvestTime)
data_sorted <- data[row_order, ]

# Per-column summary of the sorted data.
summary(data_sorted)
##  HarvestTime            BikeID          Battery      BikeIdentifier  
##  Length:26032       Min.   :  5.00   Min.   : 1.00   Min.   :  1.00  
##  Class :character   1st Qu.: 33.00   1st Qu.:45.00   1st Qu.: 29.00  
##  Mode  :character   Median : 59.00   Median :68.00   Median : 55.00  
##                     Mean   : 59.87   Mean   :62.41   Mean   : 56.33  
##                     3rd Qu.: 86.00   3rd Qu.:81.00   3rd Qu.: 83.00  
##                     Max.   :124.00   Max.   :98.00   Max.   :120.00  
##  BikeTypeName       EBikeProfileID   EBikeStateID   LastGPSTime       
##  Length:26032       Min.   :1.000   Min.   :1.000   Length:26032      
##  Class :character   1st Qu.:1.000   1st Qu.:2.000   Class :character  
##  Mode  :character   Median :1.000   Median :2.000   Mode  :character  
##                     Mean   :1.047   Mean   :2.016                     
##                     3rd Qu.:1.000   3rd Qu.:2.000                     
##                     Max.   :4.000   Max.   :5.000                     
##  LastRentalStart       Latitude       Longitude     
##  Length:26032       Min.   :53.28   Min.   :-6.363  
##  Class :character   1st Qu.:53.32   1st Qu.:-6.275  
##  Mode  :character   Median :53.34   Median :-6.258  
##                     Mean   :53.34   Mean   :-6.249  
##                     3rd Qu.:53.35   3rd Qu.:-6.232  
##                     Max.   :53.39   Max.   :-6.114
# Export the sorted, cleaned data as a CSV file.
# NOTE(review): the destination is an absolute, machine-specific path written
# with backslash separators, and it contains a space before "Data_Cleaning" --
# confirm that this is the intended directory before running elsewhere.
output_path <- "\\Users\\matenezamaninia\\Documents\\ANLY_501\\Project\\ Data_Cleaning\\Record_Cleaning_in_R\\sorted_in_R.csv"
fwrite(data_sorted, output_path, row.names = FALSE)