setwd("Z:/kaggle/taxi")
train_data=read.csv("train.csv")
head(train_data)
##          id vendor_id     pickup_datetime    dropoff_datetime
## 1 id2875421         2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2 id2377394         1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3 id3858529         2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4 id3504673         2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5 id2181028         2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6 id0801584         2 2016-01-30 22:01:40 2016-01-30 22:09:03
##   passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1               1        -73.98215        40.76794         -73.96463
## 2               1        -73.98042        40.73856         -73.99948
## 3               1        -73.97903        40.76394         -74.00533
## 4               1        -74.01004        40.71997         -74.01227
## 5               1        -73.97305        40.79321         -73.97292
## 6               6        -73.98286        40.74220         -73.99208
##   dropoff_latitude store_and_fwd_flag trip_duration
## 1         40.76560                  N           455
## 2         40.73115                  N           663
## 3         40.71009                  N          2124
## 4         40.70672                  N           429
## 5         40.78252                  N           435
## 6         40.74918                  N           443
############ Extract date elements using lubridate ##############
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
train_data$pickup_datetime<-as.character(train_data$pickup_datetime)
train_data$dropoff_datetime<-as.character(train_data$dropoff_datetime)

train_data$pickup_month<-month(train_data$pickup_datetime)
train_data$pickup_day<-day(train_data$pickup_datetime)
train_data$pickup_hour<-hour(train_data$pickup_datetime)
train_data$pickup_weekday<-wday(train_data$pickup_datetime, label=TRUE)

train_data$dropoff_month<-month(train_data$dropoff_datetime)
train_data$dropoff_day<-day(train_data$dropoff_datetime)
train_data$dropoff_hour<-hour(train_data$dropoff_datetime)
train_data$dropoff_weekday<-wday(train_data$dropoff_datetime, label=TRUE)

###############  Save modified files ##############
write.csv(train_data, file="train_data2.csv")
train_data<-read.csv("train_data2.csv")
head(train_data)
##   X        id vendor_id     pickup_datetime    dropoff_datetime
## 1 1 id2875421         2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2 2 id2377394         1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3 3 id3858529         2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4 4 id3504673         2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5 5 id2181028         2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6 6 id0801584         2 2016-01-30 22:01:40 2016-01-30 22:09:03
##   passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1               1        -73.98215        40.76794         -73.96463
## 2               1        -73.98042        40.73856         -73.99948
## 3               1        -73.97903        40.76394         -74.00533
## 4               1        -74.01004        40.71997         -74.01227
## 5               1        -73.97305        40.79321         -73.97292
## 6               6        -73.98286        40.74220         -73.99208
##   dropoff_latitude store_and_fwd_flag trip_duration pickup_month
## 1         40.76560                  N           455            3
## 2         40.73115                  N           663            6
## 3         40.71009                  N          2124            1
## 4         40.70672                  N           429            4
## 5         40.78252                  N           435            3
## 6         40.74918                  N           443            1
##   pickup_day pickup_hour pickup_weekday dropoff_month dropoff_day
## 1         14          17            Mon             3          14
## 2         12           0            Sun             6          12
## 3         19          11           Tues             1          19
## 4          6          19            Wed             4           6
## 5         26          13            Sat             3          26
## 6         30          22            Sat             1          30
##   dropoff_hour dropoff_weekday
## 1           17             Mon
## 2            0             Sun
## 3           12            Tues
## 4           19             Wed
## 5           13             Sat
## 6           22             Sat
######################## Visualize data through ggplot ggmap #####################
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
library(ggmap)

ggplot(data=train_data, aes(trip_duration))+geom_histogram(breaks=seq(0, 3000, by=100))

#gather all GPS coordinate for both pickup and dropoff with duration greater or equal to 1200 seconds

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
subset_data <- train_data %>% filter(trip_duration>1200)
dim(subset_data)
## [1] 295493     20
### x is longitude, y is latitude ###############
m <- get_map("New York City",zoom=14,maptype='toner-lite',source="stamen")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York+City&zoom=14&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6158.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6158.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6158.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6158.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6159.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6159.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6159.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6159.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6160.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6160.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6160.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6160.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6161.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6161.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6161.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6161.png
pickup<-ggmap(m) + geom_point(aes(x=pickup_longitude,y=pickup_latitude,size=trip_duration),colour="turquoise4", alpha=0.2,data=subset_data)+ggtitle("Pick up")
dropoff<-ggmap(m) + geom_point(aes(x=dropoff_longitude,y=dropoff_latitude,size=trip_duration),colour="indianred3", alpha=0.2,data=subset_data)+ggtitle("Drop off")
#################### Use gridExtra to arrange the two plots ################
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)
grid.arrange(pickup, dropoff, ncol = 2)
## Warning: Removed 241413 rows containing missing values (geom_point).
## Warning: Removed 250027 rows containing missing values (geom_point).

####################### Plot contour with overlay ###########################

p <- get_map("New York City",zoom=14,maptype='toner-lite',source="stamen")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York+City&zoom=14&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6158.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'd784ee46494dc75749e77db70533b69b.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6158.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'f84317c0c1667b4334a0fe33382ee82d.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6158.png
## Warning in file.remove(index[[url]]): cannot remove file
## '774f7fdcbcd9ef2693b629aeedeed083.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6158.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'e42469cdf73f009669fc5131b3be8ca2.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6159.png
## Warning in file.remove(index[[url]]): cannot remove file
## '4067dfd595aca34655452aff75a344f8.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6159.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'e08b3474cd7f9fcde35b19aa718c74f6.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6159.png
## Warning in file.remove(index[[url]]): cannot remove file
## '3f5add859e13ea3dcee6cc311b5dc241.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6159.png
## Warning in file.remove(index[[url]]): cannot remove file
## '6d9f27c44ae9363018c6a34887c63d3c.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6160.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'd2707e7bc06ff9d1b1334d7825fe60f2.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6160.png
## Warning in file.remove(index[[url]]): cannot remove file
## '53ce04cedeba12896d599137da760ebc.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6160.png
## Warning in file.remove(index[[url]]): cannot remove file
## '86f751cb08ea58fcfee82170f209dd08.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6160.png
## Warning in file.remove(index[[url]]): cannot remove file
## '6c2865afb5c5525be2500647bb97ed81.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6161.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'fa7db5525e3b14da76ee8419b6b5ff1a.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6161.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'aa26ac7e32370b46e6e9898ca4c03279.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6161.png
## Warning in file.remove(index[[url]]): cannot remove file
## '1898641f8245f29b45b87e53e2c6c350.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6161.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'a5bd17d96f19978b21a0d6750e9a7c39.rds', reason 'No such file or directory'
data_pickup<-subset_data %>% select (starts_with("pickup_l"))
data_dropoff<-subset_data %>% select (starts_with("dropoff_l"))
colnames(data_pickup)<-c('lon', 'lat')
colnames(data_dropoff)<-c('lon', 'lat')
combo_data<-rbind(data.frame(data_pickup, group="pickup"), data.frame(data_dropoff, group="dropoff"))


ggmap(p)+ stat_density_2d(data=combo_data, geom ='polygon', bins=10, aes(x=lon, y=lat, fill=group, alpha=..level..))+
  scale_fill_manual(values=c('pickup'='orchid4', 'dropoff'='darkorange1'))+
  scale_alpha(guide = FALSE)
## Warning: Removed 491440 rows containing non-finite values (stat_density2d).

##################### Look at trip numbers by hours and day of week #####################

pickup_sum_data<-subset_data %>% 
  group_by(pickup_hour, pickup_weekday) %>%
  summarise(total = n())
  
head(pickup_sum_data)
## # A tibble: 6 x 3
## # Groups:   pickup_hour [1]
##   pickup_hour pickup_weekday total
##         <int>         <fctr> <int>
## 1           0            Fri  1774
## 2           0            Mon   683
## 3           0            Sat  2441
## 4           0            Sun  2252
## 5           0          Thurs  1079
## 6           0           Tues   807
#put weekday string values into level so they appear in order 
pickup_sum_data$pickup_weekday<-factor(pickup_sum_data$pickup_weekday, levels=c('Mon', 'Tues', 'Wed','Thurs','Fri','Sat','Sun'))
pickup_sum_plot<-ggplot(pickup_sum_data, aes(x = pickup_weekday , y = pickup_hour)) +
  geom_tile(aes(fill=total), colour = "white") +
  scale_fill_gradient(name = "# of pickups", low = "wheat2", high = "violetred4") +
  scale_y_continuous(breaks = unique(pickup_sum_data$pickup_hour)) +
  labs( x = "Day of week", y = "Hour") +
  theme(axis.ticks = element_blank())

############################ Hourly rides #################################
group_data<-subset_data %>% 
  group_by(pickup_hour) %>%
  summarise(total = n())

hourly_plot<-ggplot(group_data, aes(x=pickup_hour, y=total))+geom_area(fill="salmon4", alpha=0.5)+
  labs(y='Total number of rides')

grid.arrange(hourly_plot, pickup_sum_plot, ncol = 2)

###########################  Calendar Heatmap   ##########################
df<-subset_data %>% select(pickup_day, pickup_hour, pickup_month)%>%
  group_by(pickup_day, pickup_hour, pickup_month) %>%
  summarise(total=n()) 


library(plyr) # Use plyr to map month column values to letters
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:lubridate':
## 
##     here
df$pickup_month2 <- mapvalues(df$pickup_month,
                     from = c(1,2,3,4,5,6),
                     to = c("Jan", "Feb", "March", "April", "May", 'June'))

p <-ggplot(df,aes(x=pickup_day,y=pickup_hour, fill=total))+
  geom_tile(color="white", size=0.1) + coord_equal()+
  scale_fill_viridis(name="Number of rides", option="D")
#put facet plot in order
p <-p + facet_grid(.~factor(pickup_month2, levels=c("Jan", "Feb", "March", "April", "May", 'June')))
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$pickup_hour))
p <-p + labs( x="Pickup Day", y="Pickup Hour")
p