Du Pham
paste("First created on July 15, 2021. Updated on", Sys.Date())
Goal: Provide graphs about how casual riders and annual members use Cyclistic bikes differently so that the team can come up with appropriate stategies to maximize the number of annual memberships.
options(warn=-1)
library(tidyverse)
library(readr)
Registered S3 method overwritten by 'rvest': method from read_xml.response xml2 -- Attaching packages --------------------------------------- tidyverse 1.2.1 -- v ggplot2 3.3.5 v purrr 0.3.2 v tibble 3.1.1 v dplyr 1.0.6 v tidyr 1.1.3 v stringr 1.4.0 v readr 1.3.1 v forcats 0.4.0 -- Conflicts ------------------------------------------ tidyverse_conflicts() -- x dplyr::filter() masks stats::filter() x dplyr::lag() masks stats::lag()
mydir <- "Divvy_Trips"
myfiles <- list.files(path=mydir, pattern="*.csv", full.names=TRUE)
myfiles
for (f in myfiles){
dat <- read_csv(f)
}
Parsed with column specification: cols( trip_id = col_double(), start_time = col_datetime(format = ""), end_time = col_datetime(format = ""), bikeid = col_double(), tripduration = col_number(), from_station_id = col_double(), from_station_name = col_character(), to_station_id = col_double(), to_station_name = col_character(), usertype = col_character(), gender = col_character(), birthyear = col_double() ) Parsed with column specification: cols( `01 - Rental Details Rental ID` = col_double(), `01 - Rental Details Local Start Time` = col_datetime(format = ""), `01 - Rental Details Local End Time` = col_datetime(format = ""), `01 - Rental Details Bike ID` = col_double(), `01 - Rental Details Duration In Seconds Uncapped` = col_number(), `03 - Rental Start Station ID` = col_double(), `03 - Rental Start Station Name` = col_character(), `02 - Rental End Station ID` = col_double(), `02 - Rental End Station Name` = col_character(), `User Type` = col_character(), `Member Gender` = col_character(), `05 - Member Details Member Birthday Year` = col_double() ) Parsed with column specification: cols( trip_id = col_double(), start_time = col_datetime(format = ""), end_time = col_datetime(format = ""), bikeid = col_double(), tripduration = col_number(), from_station_id = col_double(), from_station_name = col_character(), to_station_id = col_double(), to_station_name = col_character(), usertype = col_character(), gender = col_character(), birthyear = col_double() ) Parsed with column specification: cols( trip_id = col_double(), start_time = col_datetime(format = ""), end_time = col_datetime(format = ""), bikeid = col_double(), tripduration = col_number(), from_station_id = col_double(), from_station_name = col_character(), to_station_id = col_double(), to_station_name = col_character(), usertype = col_character(), gender = col_character(), birthyear = col_double() )
glimpse(dat)
Rows: 704,054 Columns: 12 $ trip_id <dbl> 25223640, 25223641, 25223642, 25223643, 25223644, 25~ $ start_time <dttm> 2019-10-01 00:01:39, 2019-10-01 00:02:16, 2019-10-0~ $ end_time <dttm> 2019-10-01 00:17:20, 2019-10-01 00:06:34, 2019-10-0~ $ bikeid <dbl> 2215, 6328, 3003, 3275, 5294, 1891, 1061, 1274, 6011~ $ tripduration <dbl> 940, 258, 850, 2350, 1867, 373, 1072, 1458, 1437, 83~ $ from_station_id <dbl> 20, 19, 84, 313, 210, 156, 84, 156, 156, 336, 77, 19~ $ from_station_name <chr> "Sheffield Ave & Kingsbury St", "Throop (Loomis) St ~ $ to_station_id <dbl> 309, 241, 199, 290, 382, 226, 142, 463, 463, 336, 50~ $ to_station_name <chr> "Leavitt St & Armitage Ave", "Morgan St & Polk St", ~ $ usertype <chr> "Subscriber", "Subscriber", "Subscriber", "Subscribe~ $ gender <chr> "Male", "Male", "Female", "Male", "Male", "Female", ~ $ birthyear <dbl> 1987, 1998, 1991, 1990, 1987, 1994, 1991, 1995, 1993~
head(dat)
trip_id | start_time | end_time | bikeid | tripduration | from_station_id | from_station_name | to_station_id | to_station_name | usertype | gender | birthyear |
---|---|---|---|---|---|---|---|---|---|---|---|
25223640 | 2019-10-01 00:01:39 | 2019-10-01 00:17:20 | 2215 | 940 | 20 | Sheffield Ave & Kingsbury St | 309 | Leavitt St & Armitage Ave | Subscriber | Male | 1987 |
25223641 | 2019-10-01 00:02:16 | 2019-10-01 00:06:34 | 6328 | 258 | 19 | Throop (Loomis) St & Taylor St | 241 | Morgan St & Polk St | Subscriber | Male | 1998 |
25223642 | 2019-10-01 00:04:32 | 2019-10-01 00:18:43 | 3003 | 850 | 84 | Milwaukee Ave & Grand Ave | 199 | Wabash Ave & Grand Ave | Subscriber | Female | 1991 |
25223643 | 2019-10-01 00:04:32 | 2019-10-01 00:43:43 | 3275 | 2350 | 313 | Lakeview Ave & Fullerton Pkwy | 290 | Kedzie Ave & Palmer Ct | Subscriber | Male | 1990 |
25223644 | 2019-10-01 00:04:34 | 2019-10-01 00:35:42 | 5294 | 1867 | 210 | Ashland Ave & Division St | 382 | Western Ave & Congress Pkwy | Subscriber | Male | 1987 |
25223645 | 2019-10-01 00:04:38 | 2019-10-01 00:10:51 | 1891 | 373 | 156 | Clark St & Wellington Ave | 226 | Racine Ave & Belmont Ave | Subscriber | Female | 1994 |
NA_cols <- sapply(dat, function(x) sum(is.na(x)))
NA_cols[NA_cols>0]
The missing data are in columns gender and birthyear which do not carry important numeric information for the analysis. We decide to leave the data as is.
l <- as.list(strsplit(unique(dat$usertype)," "))
# print(l)
# typeof(l)
for (name in l){
cmd <- sprintf("----------- usertype = % s -----", name)
print(eval(cmd))
d <- dat %>% filter(usertype==name)
# print(count(d, 'tripduration') )
print(summary(d[5]))
}
[1] "----------- usertype = Subscriber -----" tripduration Min. : 61 1st Qu.: 330 Median : 528 Mean : 749 3rd Qu.: 851 Max. :5169622 [1] "----------- usertype = Customer -----" tripduration Min. : 61 1st Qu.: 727 Median : 1283 Mean : 3665 3rd Qu.: 2135 Max. :8585902
We observe that the mean and median usage of the casual riders are 3,665 and 1,283, which are much higher than those of annual members, 749 and 528 resp.
dat <- dat %>%
mutate(duration_group = case_when(
# tripduration <= 60 ~ "a. <= 1h",
tripduration <= 60*8 ~ "a. <= 8h",
tripduration <= 60*24 ~ "b. 8-24h",
tripduration <= 60*168 ~ "c. 1d- 1w",
tripduration <= 60*4704 ~ "d. 1-4w",
TRUE ~ "e. > 4w"
))
dat <- dat %>%
mutate(duration_day = tripduration/(24*60))
dat %>%
filter(usertype=="Customer") %>%
group_by(duration_group) %>%
summarise_at(vars(tripduration), funs(n(),mean(., na.rm=TRUE),median(., na.rm=TRUE),sd(., na.rm=TRUE) ))
duration_group | n | mean | median | sd |
---|---|---|---|---|
a. <= 8h | 12696 | 345.8951 | 360 | 9.311473e+01 |
b. 8-24h | 46563 | 924.2205 | 903 | 2.732324e+02 |
c. 1d- 1w | 44908 | 2956.3664 | 2275 | 1.721485e+03 |
d. 1-4w | 1921 | 37481.1478 | 18453 | 4.299967e+04 |
e. > 4w | 106 | 1292997.6981 | 643713 | 1.512685e+06 |
dat %>%
filter(usertype=="Subscriber") %>%
group_by(duration_group) %>%
summarise_at(vars(tripduration), funs(n(),mean(., na.rm=TRUE),median(., na.rm=TRUE),sd(., na.rm=TRUE) ))
duration_group | n | mean | median | sd |
---|---|---|---|---|
a. <= 8h | 266973 | 308.5608 | 312.0 | 9.992137e+01 |
b. 8-24h | 283339 | 801.5121 | 740.0 | 2.505926e+02 |
c. 1d- 1w | 47095 | 1952.6450 | 1798.0 | 6.149719e+02 |
d. 1-4w | 427 | 53480.4215 | 33989.0 | 5.308332e+04 |
e. > 4w | 26 | 914154.7308 | 503003.5 | 1.250441e+06 |
ggplot(data = dat) +
# geom_histogram(mapping = aes(x = tripduration),binwidth = 1000, color = "black", fill = "gray")
geom_bar(mapping = aes(x = duration_group,fill=usertype)) +
# facet_wrap(~usertype) +
# theme(axis.text.x = element_text(angle = 45)) +
xlab("Trip duration groups") +
ylab("Number of riders") +
labs(title = "Casual Riders (Customers) vs Annual Riders (Subscribers)",subtitle ="Same scales for groups", fill= "Rider Type")
ggplot(data = dat) +
# geom_histogram(mapping = aes(x = tripduration),binwidth = 1000, color = "black", fill = "gray")
geom_bar(mapping = aes(x = duration_group,fill = usertype)) +
facet_wrap(~duration_group, scales = "free") +
scale_x_discrete(breaks= c("", "", "", "", ""))+
xlab("Trip duration groups") +
ylab("Number of riders") +
labs(title = "Casual Riders (Customers) vs Annual Riders (Subscribers)", subtitle ="Different scales for groups", fill= "Rider Type")