paste("First created on July 15, 2021. Updated on", Sys.Date())


                            
                                options(warn=-1)
library(tidyverse)
library(readr)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.3.5     v purrr   0.3.2
v tibble  3.1.1     v dplyr   1.0.6
v tidyr   1.1.3     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


                            
                                mydir  <-  "Divvy_Trips"
myfiles  <- list.files(path=mydir, pattern="*.csv", full.names=TRUE)
myfiles


                            
                                for (f in myfiles){
                                dat  <-  read_csv(f)
}

Parsed with column specification:
cols(
  trip_id = col_double(),
  start_time = col_datetime(format = ""),
  end_time = col_datetime(format = ""),
  bikeid = col_double(),
  tripduration = col_number(),
  from_station_id = col_double(),
  from_station_name = col_character(),
  to_station_id = col_double(),
  to_station_name = col_character(),
  usertype = col_character(),
  gender = col_character(),
  birthyear = col_double()
)
Parsed with column specification:
cols(
  `01 - Rental Details Rental ID` = col_double(),
  `01 - Rental Details Local Start Time` = col_datetime(format = ""),
  `01 - Rental Details Local End Time` = col_datetime(format = ""),
  `01 - Rental Details Bike ID` = col_double(),
  `01 - Rental Details Duration In Seconds Uncapped` = col_number(),
  `03 - Rental Start Station ID` = col_double(),
  `03 - Rental Start Station Name` = col_character(),
  `02 - Rental End Station ID` = col_double(),
  `02 - Rental End Station Name` = col_character(),
  `User Type` = col_character(),
  `Member Gender` = col_character(),
  `05 - Member Details Member Birthday Year` = col_double()
)
Parsed with column specification:
cols(
  trip_id = col_double(),
  start_time = col_datetime(format = ""),
  end_time = col_datetime(format = ""),
  bikeid = col_double(),
  tripduration = col_number(),
  from_station_id = col_double(),
  from_station_name = col_character(),
  to_station_id = col_double(),
  to_station_name = col_character(),
  usertype = col_character(),
  gender = col_character(),
  birthyear = col_double()
)
Parsed with column specification:
cols(
  trip_id = col_double(),
  start_time = col_datetime(format = ""),
  end_time = col_datetime(format = ""),
  bikeid = col_double(),
  tripduration = col_number(),
  from_station_id = col_double(),
  from_station_name = col_character(),
  to_station_id = col_double(),
  to_station_name = col_character(),
  usertype = col_character(),
  gender = col_character(),
  birthyear = col_double()
)


                            
                                glimpse(dat)

Rows: 704,054
Columns: 12
$ trip_id           <dbl> 25223640, 25223641, 25223642, 25223643, 25223644, 25~
$ start_time        <dttm> 2019-10-01 00:01:39, 2019-10-01 00:02:16, 2019-10-0~
$ end_time          <dttm> 2019-10-01 00:17:20, 2019-10-01 00:06:34, 2019-10-0~
$ bikeid            <dbl> 2215, 6328, 3003, 3275, 5294, 1891, 1061, 1274, 6011~
$ tripduration      <dbl> 940, 258, 850, 2350, 1867, 373, 1072, 1458, 1437, 83~
$ from_station_id   <dbl> 20, 19, 84, 313, 210, 156, 84, 156, 156, 336, 77, 19~
$ from_station_name <chr> "Sheffield Ave & Kingsbury St", "Throop (Loomis) St ~
$ to_station_id     <dbl> 309, 241, 199, 290, 382, 226, 142, 463, 463, 336, 50~
$ to_station_name   <chr> "Leavitt St & Armitage Ave", "Morgan St & Polk St", ~
$ usertype          <chr> "Subscriber", "Subscriber", "Subscriber", "Subscribe~
$ gender            <chr> "Male", "Male", "Female", "Male", "Male", "Female", ~
$ birthyear         <dbl> 1987, 1998, 1991, 1990, 1987, 1994, 1991, 1995, 1993~


                            
                                head(dat)


                            
                                NA_cols <- sapply(dat, function(x) sum(is.na(x))) 
NA_cols[NA_cols>0]


                            
                                l <- as.list(strsplit(unique(dat$usertype)," "))
# print(l)
# typeof(l)
for (name in l){
                                cmd  <- sprintf("----------- usertype = % s -----", name)
                                print(eval(cmd))  
                                d <- dat  %>%     filter(usertype==name)

#   print(count(d, 'tripduration') )
                                print(summary(d[5]))

}

[1] "----------- usertype = Subscriber -----"
  tripduration    
 Min.   :     61  
 1st Qu.:    330  
 Median :    528  
 Mean   :    749  
 3rd Qu.:    851  
 Max.   :5169622  
[1] "----------- usertype = Customer -----"
  tripduration    
 Min.   :     61  
 1st Qu.:    727  
 Median :   1283  
 Mean   :   3665  
 3rd Qu.:   2135  
 Max.   :8585902


                            
                                dat <- dat %>%
                                mutate(duration_group = case_when(
#     tripduration <= 60 ~ "a. <= 1h",
                                tripduration <= 60*8 ~ "a. <= 8h",  
                                tripduration <= 60*24 ~ "b. 8-24h",
                                tripduration <= 60*168 ~ "c. 1d- 1w",  
                                tripduration <= 60*4704 ~ "d. 1-4w", 
                                TRUE ~ "e. > 4w"
                                ))

dat <- dat %>%
                                mutate(duration_day = tripduration/(24*60))


                            
                                dat %>%
                                filter(usertype=="Customer") %>%
                                group_by(duration_group) %>% 
                                summarise_at(vars(tripduration), funs(n(),mean(., na.rm=TRUE),median(., na.rm=TRUE),sd(., na.rm=TRUE) ))


                            
                                dat %>%
                                filter(usertype=="Subscriber") %>%
                                group_by(duration_group) %>% 
                                summarise_at(vars(tripduration), funs(n(),mean(., na.rm=TRUE),median(., na.rm=TRUE),sd(., na.rm=TRUE) ))


                            
                                ggplot(data = dat) + 
#   geom_histogram(mapping = aes(x = tripduration),binwidth = 1000, color = "black", fill = "gray") 
                                geom_bar(mapping = aes(x = duration_group,fill=usertype)) +
#   facet_wrap(~usertype) +
#   theme(axis.text.x = element_text(angle = 45)) +
                                xlab("Trip duration groups") +
                                ylab("Number of riders") +
                                labs(title = "Casual Riders (Customers) vs Annual Riders (Subscribers)",subtitle ="Same scales for groups", fill= "Rider Type")


                            
                                ggplot(data = dat) + 
#   geom_histogram(mapping = aes(x = tripduration),binwidth = 1000, color = "black", fill = "gray") 
                                geom_bar(mapping = aes(x = duration_group,fill = usertype)) +
                                facet_wrap(~duration_group, scales = "free") + 
                                scale_x_discrete(breaks= c("", "", "", "", ""))+
                                xlab("Trip duration groups") +
                                ylab("Number of riders") +
                                labs(title = "Casual Riders (Customers) vs Annual Riders (Subscribers)", subtitle ="Different scales for groups", fill= "Rider Type")

duration_group	n	mean	median	sd
a. <= 8h	12696	345.8951	360	9.311473e+01
b. 8-24h	46563	924.2205	903	2.732324e+02
c. 1d- 1w	44908	2956.3664	2275	1.721485e+03
d. 1-4w	1921	37481.1478	18453	4.299967e+04
e. > 4w	106	1292997.6981	643713	1.512685e+06

duration_group	n	mean	median	sd
a. <= 8h	266973	308.5608	312.0	9.992137e+01
b. 8-24h	283339	801.5121	740.0	2.505926e+02
c. 1d- 1w	47095	1952.6450	1798.0	6.149719e+02
d. 1-4w	427	53480.4215	33989.0	5.308332e+04
e. > 4w	26	914154.7308	503003.5	1.250441e+06

DU PHAM'S PORTFOLIO

1. Data Preparation: ¶

Read the data .csv files¶

See the data type and the first few row of data.¶

We want to find if there is any COLUMNS with null values¶

2. Analysis: ¶

We now partition the users into groups of specific time windows and carry the plots to compare the numbers of users between the casual and annual riders.¶

Here is a plot when the scales of each group are different to see clearer the groups d and f¶

3. Conclusion ¶

4. Further Consideration ¶

trip_id	start_time	end_time	bikeid	tripduration	from_station_id	from_station_name	to_station_id	to_station_name	usertype	gender	birthyear
25223640	2019-10-01 00:01:39	2019-10-01 00:17:20	2215	940	20	Sheffield Ave & Kingsbury St	309	Leavitt St & Armitage Ave	Subscriber	Male	1987
25223641	2019-10-01 00:02:16	2019-10-01 00:06:34	6328	258	19	Throop (Loomis) St & Taylor St	241	Morgan St & Polk St	Subscriber	Male	1998
25223642	2019-10-01 00:04:32	2019-10-01 00:18:43	3003	850	84	Milwaukee Ave & Grand Ave	199	Wabash Ave & Grand Ave	Subscriber	Female	1991
25223643	2019-10-01 00:04:32	2019-10-01 00:43:43	3275	2350	313	Lakeview Ave & Fullerton Pkwy	290	Kedzie Ave & Palmer Ct	Subscriber	Male	1990
25223644	2019-10-01 00:04:34	2019-10-01 00:35:42	5294	1867	210	Ashland Ave & Division St	382	Western Ave & Congress Pkwy	Subscriber	Male	1987
25223645	2019-10-01 00:04:38	2019-10-01 00:10:51	1891	373	156	Clark St & Wellington Ave	226	Racine Ave & Belmont Ave	Subscriber	Female	1994

Cyclistic Bike Share Analysis¶

1. Data Preparation: ¶

Read the data .csv files¶

See the data type and the first few row of data.¶

We want to find if there is any COLUMNS with null values¶

2. Analysis: ¶

We now partition the users into groups of specific time windows and carry the plots to compare the numbers of users between the casual and annual riders.¶

Here is a plot when the scales of each group are different to see clearer the groups d and f¶

3. Conclusion ¶

4. Further Consideration ¶