R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(hrbrthemes)

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

Let's get some summary statistics and do some initial exploration

# Battery ----
# The unique values of Battery include a few negative readings. A battery
# level cannot be negative, so those rows are incorrect and are removed below.
sum(is.na(data["Battery"])) # 677 NA values in Battery; these rows are dropped.

## [1] 677

# NOTE: na.omit() drops every row that has an NA in ANY column, not only in
# Battery.
data <- na.omit(data)
min(data$Battery)

## [1] -9

max(data$Battery)

## [1] 98

# Keep only strictly positive battery readings. This removes the negative
# values and also any reading of exactly 0.
data <- data[data$Battery > 0, ]

# Extract the plotting vector AFTER filtering so the histogram shows the
# cleaned values rather than a stale copy that still contains the negatives.
battery <- as.numeric(data$Battery)
hist(battery, col = "purple")

sum(is.na(data$Battery))

## [1] 0

head(data)

##           HarvestTime BikeID Battery BikeIdentifier BikeTypeName EBikeProfileID
## 1 2020-09-23 17:00:02      5      90              1  DUB-General              1
## 2 2020-09-23 17:00:02      6      46              2  DUB-General              1
## 4 2020-09-23 17:00:02      9      68              5  DUB-General              1
## 5 2020-09-23 17:00:02     12      61              8  DUB-General              1
## 6 2020-09-23 17:00:02     14      77             11  DUB-General              1
## 7 2020-09-23 17:00:02     15      41             13  DUB-General              1
##   EBikeStateID         LastGPSTime     LastRentalStart Latitude Longitude
## 1            2 2020-09-23 17:46:16 2020-09-23 15:34:58  53.3334  -6.24431
## 2            2 2020-09-23 17:44:16 2020-09-23 17:35:42  53.3414  -6.28357
## 4            2 2020-09-23 17:51:29 2020-09-22 07:48:40  53.3252  -6.25521
## 5            2 2020-09-23 17:56:10 2020-09-23 17:43:14  53.3090  -6.21631
## 6            2 2020-09-23 17:55:13 2020-09-23 15:47:16  53.3382  -6.22257
## 7            2 2020-09-23 17:40:16 2020-09-22 06:01:22  53.2981  -6.16029

# BikeTypeName ----
# Number of records carrying each bike type label.

sum(data$BikeTypeName == "DUB-General")

## [1] 25397

sum(data$BikeTypeName == "Private")

## [1] 350

sum(data$BikeTypeName == "Workshop")

## [1] 629

# One-column data frame holding the bike type of every record.
BikeType <- data.frame(data$BikeTypeName)

# Bar chart of the proportion of records per bike type.
bike_type_counts <- table(BikeType)
bike_type_share <- prop.table(bike_type_counts)
barplot(bike_type_share, col = "light Blue")

# EBikeProfileID ----
# Number of records for each profile ID checked here (4 and 1).

sum(data$EBikeProfileID == 4)

## [1] 415

sum(data$EBikeProfileID == 1)

## [1] 25961

# One-column data frame holding the profile ID of every record.
Bikeprofile <- data.frame(data$EBikeProfileID)

# Bar chart of the proportion of records per profile ID.
profile_counts <- table(Bikeprofile)
profile_share <- prop.table(profile_counts)
barplot(profile_share, col = "light green")

EBikeStateID

# Number of records in each of the states checked here (2, 5 and 1).
sum(data[, "EBikeStateID"] == 2)

## [1] 26157

sum(data[, "EBikeStateID"] == 5)

## [1] 154

sum(data[, "EBikeStateID"] == 1)

## [1] 65

# One-column data frame holding the state ID of every record.
BikeState <- data.frame(data$EBikeStateID)

# States 1 and 5 only, so the rarer states are visible on their own chart.
bikestat_15 <- BikeState[BikeState == 1 | BikeState == 5]
barplot(prop.table(table(BikeState)), col = "pink")

barplot(prop.table(table(bikestat_15)), col = "dark red")

# Indicates: {1:'Warning - is in move and not rented',2:'Normal',3:'Switched Off',4:'Firmware Upgrade',5:'Laying on the ground'}


# This is a categorical variable, so represent it as a factor.
# The factor is built from the column vector: calling as.factor() on the
# one-column data frame does not produce one factor value per record.
BikeState <- as.factor(data$EBikeStateID)

# Latitude, Longitude ----

min(data["Latitude"])

## [1] 0

max(data["Longitude"])

## [1] 0

# Some records have 0 as their coordinate, which is not a valid location.
# Imputing a mean or median makes no sense for a location, so these rows are
# removed instead.
# (The previous extra step `data["Latitude" > 0]` compared the string
# "Latitude" with 0 and therefore filtered nothing; it has been dropped.)
data <- data[data$Latitude != 0, ]
min(data$Latitude)

## [1] 53.2798

dim(data)

## [1] 26032    11

# 26032 x 11: the data has fewer rows after cleaning.

# Order the records by bike, then by harvest time within each bike.
row_order <- with(data, order(BikeID, HarvestTime))
data_sorted <- data[row_order, ]

summary(data_sorted)

##  HarvestTime            BikeID          Battery      BikeIdentifier  
##  Length:26032       Min.   :  5.00   Min.   : 1.00   Min.   :  1.00  
##  Class :character   1st Qu.: 33.00   1st Qu.:45.00   1st Qu.: 29.00  
##  Mode  :character   Median : 59.00   Median :68.00   Median : 55.00  
##                     Mean   : 59.87   Mean   :62.41   Mean   : 56.33  
##                     3rd Qu.: 86.00   3rd Qu.:81.00   3rd Qu.: 83.00  
##                     Max.   :124.00   Max.   :98.00   Max.   :120.00  
##  BikeTypeName       EBikeProfileID   EBikeStateID   LastGPSTime       
##  Length:26032       Min.   :1.000   Min.   :1.000   Length:26032      
##  Class :character   1st Qu.:1.000   1st Qu.:2.000   Class :character  
##  Mode  :character   Median :1.000   Median :2.000   Mode  :character  
##                     Mean   :1.047   Mean   :2.016                     
##                     3rd Qu.:1.000   3rd Qu.:2.000                     
##                     Max.   :4.000   Max.   :5.000                     
##  LastRentalStart       Latitude       Longitude     
##  Length:26032       Min.   :53.28   Min.   :-6.363  
##  Class :character   1st Qu.:53.32   1st Qu.:-6.275  
##  Mode  :character   Median :53.34   Median :-6.258  
##                     Mean   :53.34   Mean   :-6.249  
##                     3rd Qu.:53.35   3rd Qu.:-6.232  
##                     Max.   :53.39   Max.   :-6.114

# Write the sorted, cleaned records to a CSV file without row names.
# NOTE(review): the destination is a hard-coded absolute path, and the folder
# name " Data_Cleaning" starts with a space -- confirm this is intended.
fwrite(
  x = data_sorted,
  file = "\\Users\\matenezamaninia\\Documents\\ANLY_501\\Project\\ Data_Cleaning\\Record_Cleaning_in_R\\sorted_in_R.csv",
  row.names = FALSE
)

RecordCleaning

Matene Zamaninia

9/29/2021

R Markdown

Let's get some summary statistics and do some initial exploration

EBikeStateID