##This case study aims to analyze the bike share data provided by Cyclistic, a bike-share program in Chicago. With over 5,800 bicycles and 600 docking stations, Cyclistic offers various types of bikes, including reclining bikes, hand tricycles, and cargo bikes, making it inclusive for people with disabilities and those who cannot use a standard two-wheeled bike. While the majority of riders opt for traditional bikes, approximately 8% of users utilize assistive options. Cyclistic users predominantly ride for leisure, but around 30% use the service for daily work commutes.

Loading the necessary packages

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.1

## Warning: package 'ggplot2' was built under R version 4.3.1

## Warning: package 'dplyr' was built under R version 4.3.1

## Warning: package 'lubridate' was built under R version 4.3.1

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(dplyr)
library(lubridate)
library(janitor)

## Warning: package 'janitor' was built under R version 4.3.1

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

Importing the year-long bike-share data (June 2022 to May 2023), one file per month

# Read the twelve monthly Divvy trip exports, one data frame per month.
# The reads are independent of each other, so they are listed in calendar
# order (June 2022 first, May 2023 last).
jun22 <- read.csv(file = "202206-divvy-tripdata.csv")
jul22 <- read.csv(file = "202207-divvy-tripdata.csv")
aug22 <- read.csv(file = "202208-divvy-tripdata.csv")
sep22 <- read.csv(file = "202209-divvy-tripdata.csv")
oct22 <- read.csv(file = "202210-divvy-tripdata.csv")
nov22 <- read.csv(file = "202211-divvy-tripdata.csv")
dec22 <- read.csv(file = "202212-divvy-tripdata.csv")
jan23 <- read.csv(file = "202301-divvy-tripdata.csv")
feb23 <- read.csv(file = "202302-divvy-tripdata.csv")
mar23 <- read.csv(file = "202303-divvy-tripdata.csv")
apr23 <- read.csv(file = "202304-divvy-tripdata.csv")
may23 <- read.csv(file = "202305-divvy-tripdata.csv")

Combining the twelve monthly tables into a single year-long data frame

# Stack the monthly tables row-wise into one data frame, newest month first
# (May 2023 down to June 2022).
tripdata <- do.call(
  rbind,
  list(
    may23, apr23, mar23, feb23, jan23, dec22,
    nov22, oct22, sep22, aug22, jul22, jun22
  )
)

Clean up and remove duplicates

# Drop every row that has an NA in any column, then collapse exact duplicate
# rows so that each remaining row appears only once.
tripdata <- tripdata %>%
  na.omit() %>%
  distinct()

Convert to POSIXct format

# Parse both timestamp columns from text into POSIXct using one shared format.
# No tz argument is passed, so the strings are interpreted in the session's
# time zone.
# NOTE(review): the timestamps are presumably Chicago local time -- confirm,
# and consider passing tz explicitly so results do not depend on the machine.
tripdata[c("ended_at", "started_at")] <- lapply(
  tripdata[c("ended_at", "started_at")],
  as.POSIXct,
  format = "%Y-%m-%d %H:%M:%S"
)

Adding columns for date, month, day, year, and day of the week (ride length is calculated further below)

# Derive calendar fields from each ride's start time.
# as.Date() on a POSIXct value converts in UTC by default, whereas started_at
# was parsed in the session's time zone. That mismatch can move late-evening
# rides onto the following date (and weekday). format() renders the timestamp
# in its own time zone, so every field below reflects the local start time.
tripdata$date <- as.Date(format(tripdata$started_at, "%Y-%m-%d"))
tripdata$month <- format(tripdata$started_at, "%m")
tripdata$day <- format(tripdata$started_at, "%d")
tripdata$year <- format(tripdata$started_at, "%Y")
# "%A" gives the full weekday name in the session's locale.
tripdata$day_of_week <- format(tripdata$started_at, "%A")

Identify rows where started_at is not less than ended_at

# Flag rides whose start is not strictly before their end.
# `%in% TRUE` maps an NA comparison (e.g. a timestamp that failed to parse) to
# FALSE, so the mask never contains NA and is always safe to use as the index
# of a subscripted assignment.
rows_to_replace <- (tripdata$started_at >= tripdata$ended_at) %in% TRUE

Replace the values in started_at with the corresponding values in ended_at

# For the flagged rows, overwrite the start time with the end time, which
# makes their computed duration zero.
tripdata$started_at <- replace(
  tripdata$started_at,
  rows_to_replace,
  tripdata$ended_at[rows_to_replace]
)

Calculate ride_length in minutes

# Ride duration in minutes, stored as a plain number rather than a difftime.
tripdata[["ride_length"]] <- as.numeric(
  difftime(
    time1 = tripdata[["ended_at"]],
    time2 = tripdata[["started_at"]],
    units = "mins"
  )
)

Removing ride_length <= 0

# Keep only rides with a strictly positive duration.
# which() discards NA comparisons; a bare logical index would instead turn
# each NA into a row filled entirely with NA values.
tripdata_2 <- tripdata[which(tripdata$ride_length > 0), ]

Order the day of the week for clean visualization

# Turn day_of_week into an ordered factor running Sunday through Saturday so
# that tables and charts list the days in calendar order.
# NOTE(review): the levels are English names; weekday strings produced under
# a different locale would not match and would become NA -- confirm locale.
tripdata_2$day_of_week <- factor(
  tripdata_2$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  ),
  ordered = TRUE
)

# Sanity check: number of rides with a missing ride length.
sum(is.na(tripdata_2[["ride_length"]]))

## [1] 0

Basic calculations

# Overall mean ride length, in minutes.
with(tripdata_2, mean(ride_length, na.rm = TRUE))

## [1] 15.57099

# Longest single ride, in minutes.
with(tripdata_2, max(ride_length, na.rm = TRUE))

## [1] 32035.45

Mean ride length by member type

# Mean ride length (minutes) for each rider type.
# The formula terms are spelled as tripdata_2$<column> on purpose: the printed
# column headers are taken from these expressions.
aggregate(
  tripdata_2$ride_length ~ tripdata_2$member_casual,
  FUN = mean
)

##   tripdata_2$member_casual tripdata_2$ride_length
## 1                   casual               20.78343
## 2                   member               12.15135

Median ride length by member type

# Median ride length (minutes) for each rider type.
# The formula terms are spelled as tripdata_2$<column> on purpose: the printed
# column headers are taken from these expressions.
aggregate(
  tripdata_2$ride_length ~ tripdata_2$member_casual,
  FUN = median
)

##   tripdata_2$member_casual tripdata_2$ride_length
## 1                   casual              12.216667
## 2                   member               8.616667

Maximum ride length by member type

# Longest ride (minutes) for each rider type.
# The formula terms are spelled as tripdata_2$<column> on purpose: the printed
# column headers are taken from these expressions.
aggregate(
  tripdata_2$ride_length ~ tripdata_2$member_casual,
  FUN = max
)

##   tripdata_2$member_casual tripdata_2$ride_length
## 1                   casual              32035.450
## 2                   member               1499.933

Mean ride length by member type and day of the week

# Mean ride length (minutes) for every rider type / weekday combination.
# The formula terms are spelled as tripdata_2$<column> on purpose: the printed
# column headers are taken from these expressions.
aggregate(
  tripdata_2$ride_length ~ tripdata_2$member_casual + tripdata_2$day_of_week,
  FUN = mean
)

##    tripdata_2$member_casual tripdata_2$day_of_week tripdata_2$ride_length
## 1                    casual                 Sunday               23.73271
## 2                    member                 Sunday               13.58787
## 3                    casual                 Monday               21.18258
## 4                    member                 Monday               11.61263
## 5                    casual                Tuesday               18.75404
## 6                    member                Tuesday               11.61798
## 7                    casual              Wednesday               18.09999
## 8                    member              Wednesday               11.61681
## 9                    casual               Thursday               18.51233
## 10                   member               Thursday               11.75105
## 11                   casual                 Friday               19.73261
## 12                   member                 Friday               11.96011
## 13                   casual               Saturday               23.15303
## 14                   member               Saturday               13.40474

Maximum ride length by member type and day of the week

# Longest ride (minutes) for every rider type / weekday combination.
# The formula terms are spelled as tripdata_2$<column> on purpose: the printed
# column headers are taken from these expressions.
aggregate(
  tripdata_2$ride_length ~ tripdata_2$member_casual + tripdata_2$day_of_week,
  FUN = max
)

##    tripdata_2$member_casual tripdata_2$day_of_week tripdata_2$ride_length
## 1                    casual                 Sunday              10807.217
## 2                    member                 Sunday               1499.933
## 3                    casual                 Monday              32035.450
## 4                    member                 Monday               1499.917
## 5                    casual                Tuesday              12136.300
## 6                    member                Tuesday               1480.650
## 7                    casual              Wednesday               3245.783
## 8                    member              Wednesday               1499.867
## 9                    casual               Thursday               2349.350
## 10                   member               Thursday               1466.283
## 11                   casual                 Friday               9962.350
## 12                   member                 Friday               1499.933
## 13                   casual               Saturday               4848.350
## 14                   member               Saturday               1499.917

#Visualizations

Total trips by customer type

# Bar chart of the total number of rides for each rider type, with the count
# printed just above each bar. The y axis is labelled in plain (non-scientific)
# notation.
tripdata_2 %>%
  group_by(member_casual) %>%
  summarise(number_of_rides = n()) %>%
  arrange(member_casual) %>%
  ggplot(
    mapping = aes(
      x = member_casual,
      y = number_of_rides,
      fill = member_casual
    )
  ) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  geom_text(mapping = aes(label = number_of_rides), vjust = -0.5) +
  scale_y_continuous(
    labels = function(value) format(value, scientific = FALSE)
  ) +
  labs(title = "Number of trips by customer type")

#Average ride length by customer type

# Bar chart of the mean ride length (minutes, rounded to three decimals) for
# each rider type, with the value printed just above each bar.
tripdata_2 %>%
  group_by(member_casual) %>%
  summarize(average_ride_length = round(mean(ride_length), digits = 3)) %>%
  ggplot(
    mapping = aes(
      x = member_casual,
      y = average_ride_length,
      fill = member_casual
    )
  ) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  geom_text(mapping = aes(label = average_ride_length), vjust = -0.5) +
  labs(title = "Average ride length")

# Total trips by customer type by day of the week

# Grouped bar chart of the number of rides per weekday, one bar per rider
# type. Weekday labels are tilted for readability and the y axis is labelled
# in plain (non-scientific) notation.
tripdata_2 %>%
  group_by(member_casual, day_of_week) %>%
  summarize(number_of_rides = n()) %>%
  arrange(member_casual, day_of_week) %>%
  ggplot(
    mapping = aes(
      x = day_of_week,
      y = number_of_rides,
      fill = member_casual
    )
  ) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  scale_y_continuous(
    labels = function(value) format(value, scientific = FALSE)
  ) +
  theme(axis.text.x = element_text(angle = 25)) +
  labs(title = "Total trips by customer type Vs Day of the week")

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

  #geom_text (aes(label = number_of_rides), vjust = -0.5) - labels all the numbers inside the chart

Average ride length by customer type by day of the week

# Grouped bar chart of the mean ride length (minutes) per weekday, one bar
# per rider type. Weekday labels are tilted for readability.
tripdata_2 %>%
  group_by(member_casual, day_of_week) %>%
  summarize(average_ride_length = mean(ride_length)) %>%
  ggplot(
    mapping = aes(
      x = day_of_week,
      y = average_ride_length,
      fill = member_casual
    )
  ) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  theme(axis.text.x = element_text(angle = 25)) +
  labs(title = "Average ride length by customer type Vs Day of the week")

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

#Total rides by month

# Grouped bar chart of the number of rides per month, one bar per rider type.
# The dodge adjustment must be passed as `position`: a misspelled argument is
# ignored by geom_col(), which then falls back to stacking the rider types.
# NOTE(review): month is a two-digit string, so the axis runs 01-12 rather than
# in the data's June 2022 -> May 2023 order.
tripdata_2 %>%
  group_by(member_casual, month) %>%
  summarize(number_of_rides = n()) %>%
  arrange(member_casual, month) %>%
  ggplot(aes(x = month, y = number_of_rides, fill = member_casual)) +
  labs(title = "Total rides by customer type per month") +
  theme(axis.text.x = element_text(angle = 25)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

  #geom_text(aes(label = number_of_rides), vjust = 0.5) - inside labels

Total ride lengths by month

# Grouped bar chart of the total ride length (minutes) per month, one bar per
# rider type.
# The summary must add up ride_length; n() would only count rides and simply
# reproduce the "total rides" chart under a different title.
# The dodge adjustment must be passed as `position`: a misspelled argument is
# ignored by geom_col(), which then falls back to stacking the rider types.
# NOTE(review): month is a two-digit string, so the axis runs 01-12 rather than
# in the data's June 2022 -> May 2023 order.
tripdata_2 %>%
  group_by(member_casual, month) %>%
  summarize(ride_length = sum(ride_length)) %>%
  arrange(member_casual, month) %>%
  ggplot(aes(x = month, y = ride_length, fill = member_casual)) +
  labs(title = "Total ride lengths by customer type per month") +
  theme(axis.text.x = element_text(angle = 25)) +
  geom_col(width = 0.5, position = position_dodge(width = 0.5)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Analyzing Cyclistic Bike Share Data and Designing a Marketing Strategy for Maximizing Annual Memberships

Padmashree Ravikiran

2023-06-30