##This case study aims to analyze the bike share data provided by Cyclistic, a bike-share program in Chicago. With over 5,800 bicycles and 600 docking stations, Cyclistic offers various types of bikes, including reclining bikes, hand tricycles, and cargo bikes, making it inclusive for people with disabilities and those who cannot use a standard two-wheeled bike. While the majority of riders opt for traditional bikes, approximately 8% of users utilize assistive options. Cyclistic users predominantly ride for leisure, but around 30% use the service for daily work commutes.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(lubridate)
library(janitor)
## Warning: package 'janitor' was built under R version 4.3.1
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Data cleaning ----
# Drop rows containing any missing value, then exact duplicate rows.
tripdata <- na.omit(tripdata)
tripdata <- distinct(tripdata)

# Convert the trip timestamps to POSIXct. No `tz` is supplied, so character
# input is interpreted in the session time zone.
tripdata$ended_at <- as.POSIXct(
  tripdata$ended_at,
  format = "%Y-%m-%d %H:%M:%S"
)
tripdata$started_at <- as.POSIXct(
  tripdata$started_at,
  format = "%Y-%m-%d %H:%M:%S"
)

# Calendar fields derived from the trip start.
# NOTE(review): as.Date() on a POSIXct converts using its `tz` argument
# (documented default "UTC"), which may differ from the local calendar day
# for late-evening rides - confirm this is the intended day.
tripdata$date <- as.Date(tripdata$started_at)
tripdata$month <- format(tripdata$date, "%m")
tripdata$day <- format(tripdata$date, "%d")
tripdata$year <- format(tripdata$date, "%Y")
# "%A" gives the weekday name in the current locale; the factor levels
# applied to `tripdata_2$day_of_week` below are the English names.
tripdata$day_of_week <- format(tripdata$date, "%A")

# Flag trips that do not end after they start. A comparison that is NA
# (a timestamp that failed to parse) is treated as "not flagged": NA
# subscripts make `[<-` fail and make `[` return all-NA rows.
rows_to_replace <- tripdata$started_at >= tripdata$ended_at
rows_to_replace[is.na(rows_to_replace)] <- FALSE

# Collapse flagged trips to zero duration so the `> 0` filter removes them.
tripdata$started_at[rows_to_replace] <- tripdata$ended_at[rows_to_replace]

# Ride duration in minutes.
tripdata$ride_length <- as.numeric(
  difftime(tripdata$ended_at, tripdata$started_at, units = "mins")
)

# Keep only trips with a known, strictly positive duration; which() drops
# rows whose comparison is NA instead of turning them into all-NA rows.
tripdata_2 <- tripdata[which(tripdata$ride_length > 0), ]

# Order weekdays Sunday -> Saturday so tables and plots follow the calendar.
tripdata_2$day_of_week <- ordered(
  tripdata_2$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)
# Descriptive statistics for ride_length ----
# Lines starting with `##` are the console output recorded for each call.

# Sanity check: number of missing ride lengths.
sum(is.na(tripdata_2$ride_length))
## [1] 0

# Overall mean and maximum ride length.
mean(tripdata_2$ride_length, na.rm = TRUE)
## [1] 15.57099
max(tripdata_2$ride_length, na.rm = TRUE)
## [1] 32035.45

# Mean, median and maximum ride length per customer type. Passing `data =`
# keeps the result columns named after the variables rather than after the
# `tripdata_2$...` expressions.
aggregate(ride_length ~ member_casual, data = tripdata_2, FUN = mean)
## member_casual ride_length
## 1 casual 20.78343
## 2 member 12.15135
aggregate(ride_length ~ member_casual, data = tripdata_2, FUN = median)
## member_casual ride_length
## 1 casual 12.216667
## 2 member 8.616667
aggregate(ride_length ~ member_casual, data = tripdata_2, FUN = max)
## member_casual ride_length
## 1 casual 32035.450
## 2 member 1499.933

# Mean ride length per customer type and day of the week.
aggregate(
  ride_length ~ member_casual + day_of_week,
  data = tripdata_2,
  FUN = mean
)
## member_casual day_of_week ride_length
## 1 casual Sunday 23.73271
## 2 member Sunday 13.58787
## 3 casual Monday 21.18258
## 4 member Monday 11.61263
## 5 casual Tuesday 18.75404
## 6 member Tuesday 11.61798
## 7 casual Wednesday 18.09999
## 8 member Wednesday 11.61681
## 9 casual Thursday 18.51233
## 10 member Thursday 11.75105
## 11 casual Friday 19.73261
## 12 member Friday 11.96011
## 13 casual Saturday 23.15303
## 14 member Saturday 13.40474

# Maximum ride length per customer type and day of the week.
aggregate(
  ride_length ~ member_casual + day_of_week,
  data = tripdata_2,
  FUN = max
)
## member_casual day_of_week ride_length
## 1 casual Sunday 10807.217
## 2 member Sunday 1499.933
## 3 casual Monday 32035.450
## 4 member Monday 1499.917
## 5 casual Tuesday 12136.300
## 6 member Tuesday 1480.650
## 7 casual Wednesday 3245.783
## 8 member Wednesday 1499.867
## 9 casual Thursday 2349.350
## 10 member Thursday 1466.283
## 11 casual Friday 9962.350
## 12 member Friday 1499.933
## 13 casual Saturday 4848.350
## 14 member Saturday 1499.917
# Visualizations ----

# Number of trips by customer type: one bar per customer type, with the
# ride count printed just above each bar.
tripdata_2 %>%
  group_by(member_casual) %>%
  summarise(number_of_rides = n()) %>%
  arrange(member_casual) %>%
  ggplot(
    mapping = aes(
      x = member_casual,
      y = number_of_rides,
      fill = member_casual
    )
  ) +
  geom_col(position = position_dodge(width = 0.5), width = 0.5) +
  geom_text(mapping = aes(label = number_of_rides), vjust = -0.5) +
  # Show full counts on the y axis instead of scientific notation.
  scale_y_continuous(labels = \(value) format(value, scientific = FALSE)) +
  labs(title = "Number of trips by customer type")
# Average ride length by customer type: mean ride_length per customer type,
# rounded to three decimals and printed above each bar.
tripdata_2 %>%
  group_by(member_casual) %>%
  summarise(
    average_ride_length = ride_length %>%
      mean() %>%
      round(digits = 3)
  ) %>%
  ggplot(
    mapping = aes(
      x = member_casual,
      y = average_ride_length,
      fill = member_casual
    )
  ) +
  geom_col(position = position_dodge(width = 0.5), width = 0.5) +
  geom_text(mapping = aes(label = average_ride_length), vjust = -0.5) +
  labs(title = "Average ride length")
# Total trips by customer type by day of the week: side-by-side bars per
# weekday, one bar per customer type.
tripdata_2 %>%
  group_by(member_casual, day_of_week) %>%
  # `.groups = "drop"` returns an ungrouped summary and silences the
  # "summarise() has grouped output" message.
  summarise(number_of_rides = n(), .groups = "drop") %>%
  arrange(member_casual, day_of_week) %>%
  ggplot(aes(x = day_of_week, y = number_of_rides, fill = member_casual)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  # Show full counts on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Total trips by customer type Vs Day of the week") +
  theme(axis.text.x = element_text(angle = 25))
# Optional layer (disabled): prints every count inside the chart.
# geom_text(aes(label = number_of_rides), vjust = -0.5)
# Average ride length by customer type by day of the week: side-by-side
# bars per weekday, one bar per customer type.
tripdata_2 %>%
  group_by(member_casual, day_of_week) %>%
  # `.groups = "drop"` returns an ungrouped summary and silences the
  # "summarise() has grouped output" message.
  summarise(average_ride_length = mean(ride_length), .groups = "drop") %>%
  ggplot(
    aes(x = day_of_week, y = average_ride_length, fill = member_casual)
  ) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  labs(title = "Average ride length by customer type Vs Day of the week") +
  theme(axis.text.x = element_text(angle = 25))
# Total rides by month: side-by-side bars per month, one bar per customer
# type.
tripdata_2 %>%
  group_by(member_casual, month) %>%
  # `.groups = "drop"` returns an ungrouped summary and silences the
  # "summarise() has grouped output" message.
  summarise(number_of_rides = n(), .groups = "drop") %>%
  arrange(member_casual, month) %>%
  ggplot(aes(x = month, y = number_of_rides, fill = member_casual)) +
  # `position` was previously misspelled as `postion`, so ggplot2 ignored it
  # (with a warning) and stacked the bars instead of dodging them.
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  # Show full counts on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Total rides by customer type per month") +
  theme(axis.text.x = element_text(angle = 25))
# Optional layer (disabled): prints the counts inside the bars.
# geom_text(aes(label = number_of_rides), vjust = 0.5)
# Total ride length by month: side-by-side bars per month, one bar per
# customer type. ride_length is in minutes (computed with units = "mins"
# earlier in this script).
tripdata_2 %>%
  group_by(member_casual, month) %>%
  # Sum the ride lengths; the previous `n()` counted rides, which made this
  # chart a duplicate of the ride-count chart despite its title.
  # `.groups = "drop"` returns an ungrouped summary and silences the
  # "summarise() has grouped output" message.
  summarise(total_ride_length = sum(ride_length), .groups = "drop") %>%
  arrange(member_casual, month) %>%
  ggplot(aes(x = month, y = total_ride_length, fill = member_casual)) +
  # `position` was previously misspelled as `postion`, so ggplot2 ignored it
  # (with a warning) and stacked the bars instead of dodging them.
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  # Show full totals on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(title = "Total ride lengths by customer type per month") +
  theme(axis.text.x = element_text(angle = 25))