setwd("Z:/kaggle/taxi")
train_data=read.csv("train.csv")
head(train_data)
## id vendor_id pickup_datetime dropoff_datetime
## 1 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6 id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1 1 -73.98215 40.76794 -73.96463
## 2 1 -73.98042 40.73856 -73.99948
## 3 1 -73.97903 40.76394 -74.00533
## 4 1 -74.01004 40.71997 -74.01227
## 5 1 -73.97305 40.79321 -73.97292
## 6 6 -73.98286 40.74220 -73.99208
## dropoff_latitude store_and_fwd_flag trip_duration
## 1 40.76560 N 455
## 2 40.73115 N 663
## 3 40.71009 N 2124
## 4 40.70672 N 429
## 5 40.78252 N 435
## 6 40.74918 N 443
############ Extract date elements using lubridate ##############
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
train_data$pickup_datetime<-as.character(train_data$pickup_datetime)
train_data$dropoff_datetime<-as.character(train_data$dropoff_datetime)
train_data$pickup_month<-month(train_data$pickup_datetime)
train_data$pickup_day<-day(train_data$pickup_datetime)
train_data$pickup_hour<-hour(train_data$pickup_datetime)
train_data$pickup_weekday<-wday(train_data$pickup_datetime, label=TRUE)
train_data$dropoff_month<-month(train_data$dropoff_datetime)
train_data$dropoff_day<-day(train_data$dropoff_datetime)
train_data$dropoff_hour<-hour(train_data$dropoff_datetime)
train_data$dropoff_weekday<-wday(train_data$dropoff_datetime, label=TRUE)
############### Save modified files ##############
write.csv(train_data, file="train_data2.csv")
train_data<-read.csv("train_data2.csv")
head(train_data)
## X id vendor_id pickup_datetime dropoff_datetime
## 1 1 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30
## 2 2 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38
## 3 3 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48
## 4 4 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40
## 5 5 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10
## 6 6 id0801584 2 2016-01-30 22:01:40 2016-01-30 22:09:03
## passenger_count pickup_longitude pickup_latitude dropoff_longitude
## 1 1 -73.98215 40.76794 -73.96463
## 2 1 -73.98042 40.73856 -73.99948
## 3 1 -73.97903 40.76394 -74.00533
## 4 1 -74.01004 40.71997 -74.01227
## 5 1 -73.97305 40.79321 -73.97292
## 6 6 -73.98286 40.74220 -73.99208
## dropoff_latitude store_and_fwd_flag trip_duration pickup_month
## 1 40.76560 N 455 3
## 2 40.73115 N 663 6
## 3 40.71009 N 2124 1
## 4 40.70672 N 429 4
## 5 40.78252 N 435 3
## 6 40.74918 N 443 1
## pickup_day pickup_hour pickup_weekday dropoff_month dropoff_day
## 1 14 17 Mon 3 14
## 2 12 0 Sun 6 12
## 3 19 11 Tues 1 19
## 4 6 19 Wed 4 6
## 5 26 13 Sat 3 26
## 6 30 22 Sat 1 30
## dropoff_hour dropoff_weekday
## 1 17 Mon
## 2 0 Sun
## 3 12 Tues
## 4 19 Wed
## 5 13 Sat
## 6 22 Sat
######################## Visualize data through ggplot ggmap #####################
library(ggplot2)
library(viridis)
## Loading required package: viridisLite
library(ggmap)
ggplot(data=train_data, aes(trip_duration))+geom_histogram(breaks=seq(0, 3000, by=100))

#gather all GPS coordinate for both pickup and dropoff with duration greater or equal to 1200 seconds
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
subset_data <- train_data %>% filter(trip_duration>1200)
dim(subset_data)
## [1] 295493 20
### x is longitude, y is latitude ###############
m <- get_map("New York City",zoom=14,maptype='toner-lite',source="stamen")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York+City&zoom=14&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6158.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6158.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6158.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6158.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6159.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6159.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6159.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6159.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6160.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6160.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6160.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6160.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6161.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6161.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6161.png
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6161.png
pickup<-ggmap(m) + geom_point(aes(x=pickup_longitude,y=pickup_latitude,size=trip_duration),colour="turquoise4", alpha=0.2,data=subset_data)+ggtitle("Pick up")
dropoff<-ggmap(m) + geom_point(aes(x=dropoff_longitude,y=dropoff_latitude,size=trip_duration),colour="indianred3", alpha=0.2,data=subset_data)+ggtitle("Drop off")
#################### Use gridExtra to arrange the two plots ################
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
grid.arrange(pickup, dropoff, ncol = 2)
## Warning: Removed 241413 rows containing missing values (geom_point).
## Warning: Removed 250027 rows containing missing values (geom_point).

####################### Plot contour with overlay ###########################
p <- get_map("New York City",zoom=14,maptype='toner-lite',source="stamen")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=New+York+City&zoom=14&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=New%20York%20City&sensor=false
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6158.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'd784ee46494dc75749e77db70533b69b.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6158.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'f84317c0c1667b4334a0fe33382ee82d.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6158.png
## Warning in file.remove(index[[url]]): cannot remove file
## '774f7fdcbcd9ef2693b629aeedeed083.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6158.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'e42469cdf73f009669fc5131b3be8ca2.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6159.png
## Warning in file.remove(index[[url]]): cannot remove file
## '4067dfd595aca34655452aff75a344f8.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6159.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'e08b3474cd7f9fcde35b19aa718c74f6.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6159.png
## Warning in file.remove(index[[url]]): cannot remove file
## '3f5add859e13ea3dcee6cc311b5dc241.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6159.png
## Warning in file.remove(index[[url]]): cannot remove file
## '6d9f27c44ae9363018c6a34887c63d3c.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6160.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'd2707e7bc06ff9d1b1334d7825fe60f2.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6160.png
## Warning in file.remove(index[[url]]): cannot remove file
## '53ce04cedeba12896d599137da760ebc.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6160.png
## Warning in file.remove(index[[url]]): cannot remove file
## '86f751cb08ea58fcfee82170f209dd08.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6160.png
## Warning in file.remove(index[[url]]): cannot remove file
## '6c2865afb5c5525be2500647bb97ed81.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4822/6161.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'fa7db5525e3b14da76ee8419b6b5ff1a.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4823/6161.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'aa26ac7e32370b46e6e9898ca4c03279.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4824/6161.png
## Warning in file.remove(index[[url]]): cannot remove file
## '1898641f8245f29b45b87e53e2c6c350.rds', reason 'No such file or directory'
## Map from URL : http://tile.stamen.com/toner-lite/14/4825/6161.png
## Warning in file.remove(index[[url]]): cannot remove file
## 'a5bd17d96f19978b21a0d6750e9a7c39.rds', reason 'No such file or directory'
data_pickup<-subset_data %>% select (starts_with("pickup_l"))
data_dropoff<-subset_data %>% select (starts_with("dropoff_l"))
colnames(data_pickup)<-c('lon', 'lat')
colnames(data_dropoff)<-c('lon', 'lat')
combo_data<-rbind(data.frame(data_pickup, group="pickup"), data.frame(data_dropoff, group="dropoff"))
ggmap(p)+ stat_density_2d(data=combo_data, geom ='polygon', bins=10, aes(x=lon, y=lat, fill=group, alpha=..level..))+
scale_fill_manual(values=c('pickup'='orchid4', 'dropoff'='darkorange1'))+
scale_alpha(guide = FALSE)
## Warning: Removed 491440 rows containing non-finite values (stat_density2d).

##################### Look at trip numbers by hours and day of week #####################
pickup_sum_data<-subset_data %>%
group_by(pickup_hour, pickup_weekday) %>%
summarise(total = n())
head(pickup_sum_data)
## # A tibble: 6 x 3
## # Groups: pickup_hour [1]
## pickup_hour pickup_weekday total
## <int> <fctr> <int>
## 1 0 Fri 1774
## 2 0 Mon 683
## 3 0 Sat 2441
## 4 0 Sun 2252
## 5 0 Thurs 1079
## 6 0 Tues 807
#put weekday string values into level so they appear in order
pickup_sum_data$pickup_weekday<-factor(pickup_sum_data$pickup_weekday, levels=c('Mon', 'Tues', 'Wed','Thurs','Fri','Sat','Sun'))
pickup_sum_plot<-ggplot(pickup_sum_data, aes(x = pickup_weekday , y = pickup_hour)) +
geom_tile(aes(fill=total), colour = "white") +
scale_fill_gradient(name = "# of pickups", low = "wheat2", high = "violetred4") +
scale_y_continuous(breaks = unique(pickup_sum_data$pickup_hour)) +
labs( x = "Day of week", y = "Hour") +
theme(axis.ticks = element_blank())
############################ Hourly rides #################################
group_data<-subset_data %>%
group_by(pickup_hour) %>%
summarise(total = n())
hourly_plot<-ggplot(group_data, aes(x=pickup_hour, y=total))+geom_area(fill="salmon4", alpha=0.5)+
labs(y='Total number of rides')
grid.arrange(hourly_plot, pickup_sum_plot, ncol = 2)

########################### Calendar Heatmap ##########################
df<-subset_data %>% select(pickup_day, pickup_hour, pickup_month)%>%
group_by(pickup_day, pickup_hour, pickup_month) %>%
summarise(total=n())
library(plyr) # Use plyr to map month column values to letters
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:lubridate':
##
## here
df$pickup_month2 <- mapvalues(df$pickup_month,
from = c(1,2,3,4,5,6),
to = c("Jan", "Feb", "March", "April", "May", 'June'))
p <-ggplot(df,aes(x=pickup_day,y=pickup_hour, fill=total))+
geom_tile(color="white", size=0.1) + coord_equal()+
scale_fill_viridis(name="Number of rides", option="D")
#put facet plot in order
p <-p + facet_grid(.~factor(pickup_month2, levels=c("Jan", "Feb", "March", "April", "May", 'June')))
p <-p + scale_y_continuous(trans = "reverse", breaks = unique(df$pickup_hour))
p <-p + labs( x="Pickup Day", y="Pickup Hour")
p
