1 File Download

  • We manually downloaded all the files from https://hdyc.neis-one.org/.
  • Type osm id
  • right mouse click and go to line 89
  • Copy it to a text editor and
  • Save it as a .json file

2 1. File import

2.1 Call files into the R environment

# Collect every contributor profile saved as a .json file. `pattern` is a
# regular expression (not a shell glob), so the dot is escaped and the match
# is anchored to the end of the file name.
filenames <- list.files(
  "Contributors/",
  pattern = "\\.json$",
  full.names = TRUE
)
# Import data: read each file, stack the results in file order and number
# the rows so that `document.id` follows the order of `filenames`.
osm_json <- filenames %>%
  map(read_json) %>%
  reduce(bind_rows) %>%
  mutate(document.id = row_number())

2.2 Take a glance

All variables in the data are listed below.

# Flatten every document and list the resulting column names
colnames(spread_all(osm_json))
##   [1] "document.id"                                
##   [2] "publicprofile"                              
##   [3] "contributor.name"                           
##   [4] "contributor.uid"                            
##   [5] "contributor.since"                          
##   [6] "contributor.traces"                         
##   [7] "contributor.blocks"                         
##   [8] "contributor.blocks_active"                  
##   [9] "contributor.img"                            
##  [10] "contributor.roles"                          
##  [11] "contributor.blocks_issued"                  
##  [12] "contributor.blocks_active_issued"           
##  [13] "contributor.organised_editing"              
##  [14] "node.f_id"                                  
##  [15] "node.f_tstamp"                              
##  [16] "node.f_lon"                                 
##  [17] "node.f_lat"                                 
##  [18] "node.l_id"                                  
##  [19] "node.l_tstamp"                              
##  [20] "node.l_lon"                                 
##  [21] "node.l_lat"                                 
##  [22] "nodes.c"                                    
##  [23] "nodes.m"                                    
##  [24] "nodes.d"                                    
##  [25] "nodes.lm"                                   
##  [26] "ways.c"                                     
##  [27] "ways.m"                                     
##  [28] "ways.d"                                     
##  [29] "ways.lm"                                    
##  [30] "waytags.c_addr"                             
##  [31] "waytags.m_addr"                             
##  [32] "waytags.c_building"                         
##  [33] "waytags.m_building"                         
##  [34] "waytags.c_highway"                          
##  [35] "waytags.m_highway"                          
##  [36] "waytags.c_landuse"                          
##  [37] "waytags.m_landuse"                          
##  [38] "waytags.c_name"                             
##  [39] "waytags.m_name"                             
##  [40] "waytags.c_natural"                          
##  [41] "waytags.m_natural"                          
##  [42] "waytags.c_waterway"                         
##  [43] "waytags.m_waterway"                         
##  [44] "relations.c"                                
##  [45] "relations.m"                                
##  [46] "relations.d"                                
##  [47] "relations.lm"                               
##  [48] "changesets.no"                              
##  [49] "changesets.changes"                         
##  [50] "changesets.max_gap_days"                    
##  [51] "changesets.mapping_days_year"               
##  [52] "changesets.info"                            
##  [53] "changesets.editors"                         
##  [54] "changesets.bboxs"                           
##  [55] "changesets.days"                            
##  [56] "changesets.hours"                           
##  [57] "changesets.mapping_days"                    
##  [58] "changesets.comment_wordcount"               
##  [59] "changesets.hashtag_wordcount"               
##  [60] "countries.countries"                        
##  [61] "notes.op"                                   
##  [62] "notes.co"                                   
##  [63] "notes.cl"                                   
##  [64] "notes.clco"                                 
##  [65] "discussion.dis_changesets"                  
##  [66] "discussion.com_changesets"                  
##  [67] "discussion.num_comments"                    
##  [68] "discussion.dis_responses"                   
##  [69] "changesets."                                
##  [70] "changesets.review_requested"                
##  [71] "lastmodifier.node_tags.ref"                 
##  [72] "lastmodifier.node_tags.name"                
##  [73] "lastmodifier.node_tags.shop"                
##  [74] "lastmodifier.node_tags.addr:"               
##  [75] "lastmodifier.node_tags.name:"               
##  [76] "lastmodifier.node_tags.place"               
##  [77] "lastmodifier.node_tags.power"               
##  [78] "lastmodifier.node_tags.sport"               
##  [79] "lastmodifier.node_tags.amenity"             
##  [80] "lastmodifier.node_tags.barrier"             
##  [81] "lastmodifier.node_tags.highway"             
##  [82] "lastmodifier.node_tags.leisure"             
##  [83] "lastmodifier.node_tags.natural"             
##  [84] "lastmodifier.node_tags.railway"             
##  [85] "lastmodifier.node_tags.surface"             
##  [86] "lastmodifier.node_tags.tourism"             
##  [87] "lastmodifier.node_tags.building"            
##  [88] "lastmodifier.node_tags.historic"            
##  [89] "lastmodifier.node_tags.man_made"            
##  [90] "lastmodifier.node_tags.waterway"            
##  [91] "lastmodifier.node_tags.building:"           
##  [92] "lastmodifier.node_tags.emergency"           
##  [93] "lastmodifier.node_tags.public_transport"    
##  [94] "lastmodifier.way_tags.ref"                  
##  [95] "lastmodifier.way_tags.name"                 
##  [96] "lastmodifier.way_tags.shop"                 
##  [97] "lastmodifier.way_tags.addr:"                
##  [98] "lastmodifier.way_tags.name:"                
##  [99] "lastmodifier.way_tags.place"                
## [100] "lastmodifier.way_tags.power"                
## [101] "lastmodifier.way_tags.sport"                
## [102] "lastmodifier.way_tags.amenity"              
## [103] "lastmodifier.way_tags.barrier"              
## [104] "lastmodifier.way_tags.highway"              
## [105] "lastmodifier.way_tags.landuse"              
## [106] "lastmodifier.way_tags.leisure"              
## [107] "lastmodifier.way_tags.natural"              
## [108] "lastmodifier.way_tags.railway"              
## [109] "lastmodifier.way_tags.surface"              
## [110] "lastmodifier.way_tags.tourism"              
## [111] "lastmodifier.way_tags.boundary"             
## [112] "lastmodifier.way_tags.building"             
## [113] "lastmodifier.way_tags.historic"             
## [114] "lastmodifier.way_tags.man_made"             
## [115] "lastmodifier.way_tags.waterway"             
## [116] "lastmodifier.way_tags.building:"            
## [117] "lastmodifier.way_tags.emergency"            
## [118] "lastmodifier.way_tags.public_transport"     
## [119] "lastmodifier.relation_tags.ref"             
## [120] "lastmodifier.relation_tags.name"            
## [121] "lastmodifier.relation_tags.shop"            
## [122] "lastmodifier.relation_tags.addr:"           
## [123] "lastmodifier.relation_tags.name:"           
## [124] "lastmodifier.relation_tags.route"           
## [125] "lastmodifier.relation_tags.amenity"         
## [126] "lastmodifier.relation_tags.highway"         
## [127] "lastmodifier.relation_tags.landuse"         
## [128] "lastmodifier.relation_tags.natural"         
## [129] "lastmodifier.relation_tags.surface"         
## [130] "lastmodifier.relation_tags.tourism"         
## [131] "lastmodifier.relation_tags.building"        
## [132] "lastmodifier.relation_tags.historic"        
## [133] "lastmodifier.relation_tags.building:"       
## [134] "lastmodifier.relation_tags.restriction"     
## [135] "lastmodifier.relation_tags.public_transport"
## [136] "accounts.OSM Wiki.id"                       
## [137] "accounts.OSM Wiki.Editcount"                
## [138] "accounts.OSM Help.id"                       
## [139] "accounts.OSM Help.Reputation"               
## [140] "accounts.OSM Forum.id"                      
## [141] "accounts.OSM Forum.Posts"                   
## [142] "accounts.Mapillary.id"                      
## [143] "accounts.Mapillary.Total Photos"            
## [144] "accounts.Mapillary.Total Distance"          
## [145] "lastmodifier.way_tags.route"                
## [146] "lastmodifier.relation_tags.boundary"        
## [147] "lastmodifier.relation_tags.waterway"        
## [148] "lastmodifier.relation_tags.emergency"       
## [149] "lastmodifier.node_tags.landuse"             
## [150] "lastmodifier.relation_tags.place"           
## [151] "lastmodifier.relation_tags.power"           
## [152] "lastmodifier.relation_tags.sport"           
## [153] "lastmodifier.relation_tags.leisure"         
## [154] "lastmodifier.relation_tags.man_made"        
## [155] "lastmodifier.relation_tags.addr"            
## [156] "lastmodifier.relation_tags.barrier"         
## [157] "accounts.OpenStreetCam.id"                  
## [158] "accounts.OpenStreetCam.Total Photos"        
## [159] "accounts.OpenStreetCam.Total Distance"      
## [160] "accounts.Github.id"                         
## [161] "accounts.Github.Contributions"              
## [162] "lastmodifier.node_tags.boundary"            
## [163] "lastmodifier.relation_tags.railway"         
## [164] "lastmodifier.way_tags.restriction"          
## [165] "lastmodifier.node_tags.restriction"         
## [166] "lastmodifier.node_tags.route"               
## [167] "accounts.OSM Streak.id"                     
## [168] "accounts.OSM Streak.Points"                 
## [169] "lastmodifier.node_tags.addr"                
## [170] "lastmodifier.way_tags.addr"                 
## [171] "..JSON"

The code below lists the top-level nodes, which helps in understanding the overall structure.

# Count how often each top-level key occurs with each JSON type
print(
  count(json_types(gather_object(osm_json)), name, type),
  n = Inf
)
## # A tibble: 18 x 3
##    name          type       n
##    <chr>         <fct>  <int>
##  1 accounts      object    69
##  2 accounts      array    215
##  3 calendar      array    284
##  4 changesets    object   284
##  5 contributor   object   284
##  6 countries     object   284
##  7 discussion    object   284
##  8 lastmodifier  object   284
##  9 node          object   284
## 10 nodes         object   284
## 11 notes         object   284
## 12 publicprofile string   284
## 13 qa            array    284
## 14 ranks         array    284
## 15 recent        array    284
## 16 relations     object   284
## 17 ways          object   284
## 18 waytags       object   284

2.3 Select variables

Using enter_object(), gather_object(), and spread_values(), you can create and customise data.frames.

# Enter the `contributor` object and spread four of its fields as strings
spread_values(
  enter_object(osm_json, "contributor"),
  name = jstring(name),
  uid = jstring(uid),
  traces = jstring(traces),
  blocks = jstring(blocks)
)
## # A tbl_json: 284 x 6 tibble with a "JSON" attribute
##    ..JSON                    document.id name         uid     traces blocks
##    <chr>                           <int> <chr>        <chr>   <chr>  <chr> 
##  1 "{\"name\":\"!i!\",\"..."           1 !i!          25720   55     0     
##  2 "{\"name\":\"<0174\"..."            2 <0174        172147  204    0     
##  3 "{\"name\":\"24dake..."             3 24dakenlo    3354215 0      0     
##  4 "{\"name\":\"ACM\",\"..."           4 ACM          20372   0      0     
##  5 "{\"name\":\"ACS198..."             5 ACS1986      2018957 54     0     
##  6 "{\"name\":\"Adam F..."             6 Adam Franco  27832   36     0     
##  7 "{\"name\":\"AkuAna..."             7 AkuAnakTimur 1407839 1388   0     
##  8 "{\"name\":\"alan_g..."             8 alan_gr      5307206 3      0     
##  9 "{\"name\":\"alarob..."             9 alarobric    525369  22     0     
## 10 "{\"name\":\"Alaska..."            10 AlaskaDave   473104  881    0     
## # … with 274 more rows

To avoid verbose code, we can transform the entire JSON structure into a data frame using the code below.

# Flatten every document into one wide tibble and parse the sign-up date
osm_tibble <- osm_json %>%
  spread_all() %>%
  as_tibble() %>%
  mutate(contributor.since = as.Date(contributor.since))

class(osm_tibble)
## [1] "tbl_df"     "tbl"        "data.frame"
##
# Distribution of the total number of changes per contributor
duration <- as.numeric(osm_tibble$changesets.changes)
summary(duration)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       71    60362   230727  1178502  1022159 55758489
quantile(duration, c(.01, .05, .32, .57, .98)) 
##         1%         5%        32%        57%        98% 
##     326.77    2299.20   93648.92  330779.04 7851756.94
# Drop users with fewer than 1500 changes. The column is coerced to numeric
# first (as for `duration` above) so the threshold is compared as a number;
# on a character column `>=` would compare lexicographically.
osm_tibble <- osm_tibble %>%
  filter(as.numeric(changesets.changes) >= 1500)


## year
# Histogram of the year in which each contributor registered
osm_tibble$contributor.since %>%
  lubridate::year() %>%
  hist(main = "User's First Year of Contribution")

# The same information as a frequency table
osm_tibble$contributor.since %>%
  lubridate::year() %>%
  table()
## .
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 
##    4    7   19   28   31   27   20   17   25   22   12   13   10

2.4 Import Excel files

# import Excel
# Read the survey workbook and discard the two columns that are not needed
survey_original <- readxl::read_xlsx("OSM survey data.xlsx") %>%
  select(
    -`(Found) Username`,
    -`6.a. If you selected Other, please specify:`
  )

# Give the survey questions short, code-friendly column names
survey <- rename(
  survey_original,
  username = `1. What is your OpenStreetMap Username?`,
  gender = `2. What gender do you identify as?`,
  age = `3. What is your age?`,
  country_residence = `4. What is your country of residence?`,
  nationality = `5. What is your nationality?`,
  education = `6. What is your highest level of education?`,
  continent = Continent,
  tz = `timezone(UTC)`
)

# Create a dataframe that contains all users names
usernames_df <- data.frame(filenames)

# Document id and OSM user name of every contributor kept in `osm_tibble`
osm_filtered <- osm_tibble %>%
  select(document.id, username = contributor.name)

# Derive the user name from each file path, attach the survey answers and the
# document id, and keep only users with a document id and a stated gender.
# `basename()` removes the directory part regardless of how the path was
# built, and the anchored "\\.json$" removes only the file extension (an
# unanchored ".json" would also eat matching text inside a user name).
# Left joins keep the row order of `filenames`.
usernames_df <- usernames_df %>%
  rename(username = filenames) %>%
  mutate(username = sub("\\.json$", "", basename(as.character(username)))) %>%
  left_join(survey, by = "username") %>%
  left_join(osm_filtered, by = "username") %>%
  drop_na(document.id, gender) %>%
  filter(gender != "Prefer not to say") %>%
  as_tibble()
# Cross-tabulate gender by age, then show age groups as rows
dem_table <- table(usernames_df$gender, usernames_df$age)
knitr::kable(as.data.frame.matrix(t(dem_table)))
Female Male
>70 1 5
18-24 5 18
25-29 9 29
30-34 4 33
35-39 6 25
40-44 2 23
45-49 2 19
50-54 1 15
55-59 0 11
60-64 0 6
65-69 0 4
# Number and share of respondents per gender
usernames_df %>%
  count(gender) %>%
  mutate(freq = n / sum(n))
## # A tibble: 2 x 3
##   gender     n  freq
##   <chr>  <int> <dbl>
## 1 Female    30 0.138
## 2 Male     188 0.862
# Number of respondents per gender and age group; `freq` is the share of each
# age group within its gender
usernames_df %>%
  count(gender, age) %>%
  group_by(gender) %>%
  mutate(freq = n / sum(n))
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
## # A tibble: 19 x 4
## # Groups:   gender [2]
##    gender age       n   freq
##    <chr>  <chr> <int>  <dbl>
##  1 Female >70       1 0.0333
##  2 Female 18-24     5 0.167 
##  3 Female 25-29     9 0.3   
##  4 Female 30-34     4 0.133 
##  5 Female 35-39     6 0.2   
##  6 Female 40-44     2 0.0667
##  7 Female 45-49     2 0.0667
##  8 Female 50-54     1 0.0333
##  9 Male   >70       5 0.0266
## 10 Male   18-24    18 0.0957
## 11 Male   25-29    29 0.154 
## 12 Male   30-34    33 0.176 
## 13 Male   35-39    25 0.133 
## 14 Male   40-44    23 0.122 
## 15 Male   45-49    19 0.101 
## 16 Male   50-54    15 0.0798
## 17 Male   55-59    11 0.0585
## 18 Male   60-64     6 0.0319
## 19 Male   65-69     4 0.0213

3 Profiles

# Respondents per country of residence
usernames_df %>%
  pull(country_residence) %>%
  table()
## .
##        Albania      Australia        Austria        Belarus        Belgium 
##              1              3              7              1              4 
##         Brazil         Canada       Colombia Czech Republic        Denmark 
##              2              7              2              3              1 
##       Ethiopia         France        Germany      Hong Kong          India 
##              1             13             50              1              1 
##      Indonesia          Italy          Japan      Lithuania       Malaysia 
##              1              3              3              1              1 
##         Mexico    Netherlands      Nicaragua         Norway    Philippines 
##              1              8              1              1              6 
##         Poland       Portugal         Russia       Slovakia          Spain 
##              3              1              6              1              1 
##         Sweden    Switzerland         Taiwan         Uganda             UK 
##              1              1              1              1             37 
##        Ukraine            USA 
##              1             41
# Respondents per continent
usernames_df %>%
  pull(continent) %>%
  table()
## .
##    Africa      Asia C.America    Europe N.America   Oceania S.America 
##         2        14         2       145        48         3         4

4 Days of Week

4.1 Data cleaning and summary table

# Raw `changesets.days` string of every user retained in `usernames_df`
days <- osm_json %>%
  as_tbl_json(drop.nulljson = TRUE) %>%
  enter_object("changesets") %>%
  spread_values(days = jstring(days)) %>%
  filter(document.id %in% usernames_df$document.id) %>%
  as.data.frame() %>%
  pull(days)

# Strip the trailing "|", split on the remaining "|" and on ",", and convert
# the seven comma-separated fields (named Sun..Sat here) to numbers
days_df <- data.frame(days) %>%
  mutate(days = sub("\\|$", "", days)) %>%
  separate_rows(days, sep = "\\|") %>%
  separate(
    days,
    into = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"),
    sep = ","
  ) %>%
  mutate(across(where(is.character), as.numeric))


# Column means, rounded to whole numbers
round(colMeans(days_df), 0)
##  Sun  Mon  Tue  Wed  Thu  Fri  Sat 
## 1095 1017  941  948  942  963 1039
# Attach the weekday counts to the user table by row position and put the age
# groups in ascending order
days_df_combined <- bind_cols(usernames_df, days_df) %>%
  mutate(
    age = fct_relevel(
      age,
      "18-24", "25-29", "30-34", "35-39", "40-44",
      "45-49", "50-54", "55-59", "60-64", "65-69", ">70"
    )
  ) %>%
  as_tibble()


# Each user's weekday counts as a share of that user's weekly total,
# rounded to three decimals
days_df_per <- days_df_combined %>%
  select(gender, age, Sun:Sat) %>%
  rowwise() %>%
  mutate(weeksum = sum(c_across(Sun:Sat))) %>%
  ungroup() %>%
  mutate(across(Sun:Sat, ~ .x / weeksum)) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  select(-weeksum)

4.2 Outcome

4.2.1 Results: Observation

  • Weekly Contribution (Averaged)

    • Males contribute nearly twice as much per week as their female counterparts
    • The averaged totals indicate that females contribute 100 fewer changesets during the weekend than on weekdays
# Mean of all weekday counts, per gender
days_df_combined %>%
  select(gender, Sun:Sat) %>%
  pivot_longer(cols = Sun:Sat, names_to = "days", values_to = "count") %>%
  group_by(gender) %>%
  summarise(mean_cont_per_week = mean(count))
## # A tibble: 2 x 2
##   gender mean_cont_per_week
##   <chr>               <dbl>
## 1 Female               544.
## 2 Male                1064.
# Box plot of each user's summed weekday counts, split by gender
days_df_combined %>%
  filter(gender != "Prefer not to say") %>%
  select(gender, age, Sun:Sat) %>%
  rowwise() %>%
  mutate(
    mean_cont_per_week = sum(c_across(where(is.numeric)), na.rm = TRUE)
  ) %>%
  select(gender, age, mean_cont_per_week) %>%
  ggplot(aes(x = gender, y = mean_cont_per_week, fill = gender)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 35000)) +
  xlab("") +
  ylab("Averaged Number of 'Days Active'") +
  theme_bw() +
  theme(legend.position = "bottom")
Gender Comparison

Gender Comparison

# Mean count per weekday for each gender, drawn as one line per gender
days_df_combined %>%
  select(gender, age, Sun:Sat) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  xlab("") +
  ylab("Averaged Number of 'Days Active'") +
  theme_bw() +
  theme(legend.position = "bottom")

# Mean count per weekday for each gender, one panel per age group
days_df_combined %>%
  select(gender, age, Sun:Sat) %>%
  group_by(gender, age) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  facet_wrap(~age, scales = "free_y") +
  xlab("") +
  ylab("Average Contributions During the Week") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(-20, 0, 0, 0),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

# Mean of every numeric column per gender, without the id and time-zone columns
days_df_combined %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  select(-document.id, -tz)
## # A tibble: 2 x 8
##   gender   Sun   Mon   Tue   Wed   Thu   Fri   Sat
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female  445.  594.  578.  580.  600.  557.  452.
## 2 Male   1199. 1084.  999. 1007.  997. 1028. 1133.
# Box plot of each user's summed weekday counts by gender, filled by age group
days_df_combined %>%
  filter(gender != "Prefer not to say") %>%
  select(gender, age, Sun:Sat) %>%
  rowwise() %>%
  mutate(
    mean_cont_per_week = sum(c_across(where(is.numeric)), na.rm = TRUE)
  ) %>%
  select(gender, age, mean_cont_per_week) %>%
  ggplot(aes(x = gender, y = mean_cont_per_week, fill = age)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 35000)) +
  xlab("") +
  ylab("Days Active") +
  theme_bw() +
  theme(legend.position = "bottom")
Gender Comparison by Age group

Gender Comparison by Age group

4.2.2 Results: Ratio

  • Weekly Contribution based on ratio

    • The results are similar to those based on the observed counts
    • The ratios show more clearly that women, albeit in a small sample (n=25), contribute 5-7% less at weekends than on weekdays
# Mean weekday share per gender, rounded to three decimals
days_df_per %>%
  group_by(gender) %>%
  summarise(
    across(where(is.numeric), ~ round(mean(.x, na.rm = TRUE), digits = 3))
  )
## # A tibble: 2 x 8
##   gender   Sun   Mon   Tue   Wed   Thu   Fri   Sat
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female 0.091 0.143 0.152 0.163 0.17  0.183 0.098
## 2 Male   0.155 0.146 0.14  0.14  0.135 0.142 0.142
# Mean weekday share per gender, drawn as one line per gender
days_df_per %>%
  group_by(gender) %>%
  summarise(
    across(where(is.numeric), ~ round(mean(.x, na.rm = TRUE), digits = 3))
  ) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  xlab("") +
  ylab("Days Active (%)") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    text = element_text(size = 15)
  )

# Mean weekday share per gender, one panel per age group. The dashed line is
# drawn at 0.142.
days_df_per %>%
  group_by(gender, age) %>%
  summarise(
    across(where(is.numeric), ~ round(mean(.x, na.rm = TRUE), digits = 3))
  ) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  geom_hline(yintercept = .142, linetype = "dashed") +
  facet_wrap(~age, scales = "free_y") +
  xlab("") +
  ylab("Days Active (%)") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(-20, 0, 0, 0),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
## `mutate_if()` ignored the following grouping variables:
## Column `gender`

5 Hours of day

# Raw `changesets.hours` string of every user retained in `usernames_df`
hours <- osm_json %>%
  as_tbl_json(drop.nulljson = TRUE) %>%
  enter_object("changesets") %>%
  spread_values(hours = jstring(hours)) %>%
  filter(document.id %in% usernames_df$document.id) %>%
  as.data.frame() %>%
  pull(hours)

# Strip the trailing "|", split on the remaining "|" and on ",", and convert
# the 24 comma-separated fields (named h00..h23 here) to numbers
hours_df <- data.frame(hours) %>%
  mutate(hours = sub("\\|$", "", hours)) %>%
  separate_rows(hours, sep = "\\|") %>%
  separate(hours, into = sprintf("h%02d", 0:23), sep = ",") %>%
  mutate(across(where(is.character), as.numeric))


# Attach the hourly counts to the user table by row position and put the age
# groups in ascending order
hours_df_combined <- bind_cols(usernames_df, hours_df) %>%
  mutate(
    age = fct_relevel(
      age,
      "18-24", "25-29", "30-34", "35-39", "40-44",
      "45-49", "50-54", "55-59", "60-64", "65-69", ">70"
    )
  )

# Re-label each hourly count from its original hour column (stored in `UTC`)
# to the user's local hour by adding `tz` and wrapping around midnight.
#  - `(tz + UTC) %% 24` maps any shifted hour back into 0..23.
#  - `as.integer()` drops a fractional part, so a non-integer offset is binned
#    into the local hour that contains it instead of producing a malformed
#    column name.
#  - `sprintf("h%02d", ...)` rebuilds the zero-padded names h00..h23.
# Assumes `tz` is numeric and has no missing values for the retained users.
# Sorting by `local` makes the widened columns come out in h00..h23 order; the
# user-level columns are then re-attached by `document.id`.
hours_df_combined <- hours_df_combined %>%
  select(document.id, tz, h00:h23) %>%
  pivot_longer(cols = h00:h23, names_to = "UTC", values_to = "Count") %>%
  mutate(
    UTC = as.numeric(str_sub(UTC, -2)),
    local = sprintf("h%02d", as.integer((tz + UTC) %% 24))
  ) %>%
  select(document.id, Count, local) %>%
  arrange(document.id, local, Count) %>%
  pivot_wider(names_from = local, values_from = Count) %>%
  left_join(
    hours_df_combined %>% select(username:document.id),
    by = "document.id"
  )
# Each user's hourly counts as a share of that user's total over the 24 hour
# columns, rounded to three decimals
hours_df_per <- hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  rowwise() %>%
  mutate(dailysum = sum(c_across(h00:h23))) %>%
  ungroup() %>%
  mutate(across(h00:h23, ~ .x / dailysum)) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  select(-dailysum)

5.1 Summary Statistics

# Mean of all hourly counts, per gender
hours_df_combined %>%
  select(gender, h00:h23) %>%
  pivot_longer(cols = h00:h23, names_to = "Hours", values_to = "count") %>%
  group_by(gender) %>%
  summarise(mean = mean(count))
## # A tibble: 2 x 2
##   gender  mean
##   <chr>  <dbl>
## 1 Female  159.
## 2 Male    310.
# Mean count for every hour, with one column per gender; print all 24 rows
hours_df_combined %>%
  select(gender, h00:h23) %>%
  pivot_longer(cols = h00:h23, names_to = "Hours", values_to = "count") %>%
  group_by(gender, Hours) %>%
  summarise(mean = mean(count)) %>%
  pivot_wider(names_from = gender, values_from = mean) %>%
  print(n = Inf)
## # A tibble: 24 x 3
##    Hours Female  Male
##    <chr>  <dbl> <dbl>
##  1 h00     45.8 229. 
##  2 h01     20.5 154. 
##  3 h02     14.6  95.8
##  4 h03     35.0  61.2
##  5 h04    102.   59.9
##  6 h05    167.   80.5
##  7 h06    120.  150. 
##  8 h07     72.6 227. 
##  9 h08    112.  282. 
## 10 h09    197.  326. 
## 11 h10    264.  372. 
## 12 h11    270.  395. 
## 13 h12    210.  395. 
## 14 h13    246.  415. 
## 15 h14    260.  439. 
## 16 h15    258.  421. 
## 17 h16    238.  431. 
## 18 h17    216.  441. 
## 19 h18    186.  461. 
## 20 h19    173.  449. 
## 21 h20    181.  432. 
## 22 h21    183.  418. 
## 23 h22    143.  389. 
## 24 h23     89.8 325.

5.2 Results: Observations

# Mean count per local hour for each gender, all countries together
hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  xlab("") +
  ylab("Days Active") +
  theme_bw() +
  theme(legend.position = "bottom")

UK

# Mean count per local hour for each gender, UK residents only
hours_df_combined %>%
  filter(country_residence == "UK") %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  xlab("") +
  ylab("Days Active") +
  theme_bw() +
  theme(legend.position = "bottom")

USA

# Mean count per local hour for each gender, USA residents only
hours_df_combined %>%
  filter(country_residence == "USA") %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  xlab("") +
  ylab("Days Active") +
  theme_bw() +
  theme(legend.position = "bottom")

Germany

# Mean count per local hour for each gender, Germany residents only
hours_df_combined %>%
  filter(country_residence == "Germany") %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  xlab("") +
  ylab("Days Active") +
  theme_bw() +
  theme(legend.position = "bottom")

# Mean count per local hour, one line per age group
hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  group_by(age) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "age",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = age, colour = age)) +
  geom_point() +
  geom_line() +
  xlab("") +
  ylab("Days Active") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

# Mean count per local hour, one line per age group and one row of panels per
# gender; the ">70" group is left out
hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  filter(age != ">70") %>%
  group_by(age, gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = age, colour = age)) +
  geom_point() +
  geom_line() +
  facet_grid(rows = vars(gender)) +
  xlab("") +
  ylab("Days Active") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

# Mean count per local hour for each gender, one panel per age group
hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  group_by(age, gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  facet_wrap(~age, scales = "free_y", ncol = 3) +
  xlab("") +
  ylab("Days Active") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

5.3 Results: Ratio

# Mean hourly share for each gender
hours_df_per %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  xlab("") +
  ylab("Days Active(%)") +
  theme_bw() +
  theme(legend.position = "bottom")

# Mean hourly share for each gender, one panel per age group
hours_df_per %>%
  select(gender, age, h00:h23) %>%
  group_by(age, gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  facet_wrap(~age, scales = "free_y", ncol = 3) +
  xlab("") +
  ylab("Days Active(%)") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )

6 Bboxes: How geospatially wide are people's contributions in OSM

6.1 Summary Statistics

# Raw `changesets.bboxs` string of every user retained in `usernames_df`
bboxs <- osm_json %>%
  as_tbl_json(drop.nulljson = TRUE) %>%
  enter_object("changesets") %>%
  spread_values(number = jstring(bboxs)) %>%
  filter(document.id %in% usernames_df$document.id) %>%
  as.data.frame() %>%
  pull(number)


# Keep the first ";"-separated field of each string as a number. Extracting it
# row by row guarantees exactly one value per user, so the positional
# `bind_cols()` below stays aligned even when a string does not contain
# exactly three fields (splitting into rows and keeping every third row
# relied on that count).
bboxs_df <- data.frame(bboxs) %>%
  mutate(
    bboxs = sub("\\|$", "", bboxs),
    bboxs = as.numeric(sub(";.*$", "", bboxs))
  )

# Attach the value to the user table by row position and put the age groups
# in ascending order
bboxs_df_combined <- usernames_df %>%
  bind_cols(bboxs_df) %>%
  drop_na(gender) %>%
  mutate(
    age = fct_relevel(
      age,
      "18-24", "25-29", "30-34", "35-39", "40-44",
      "45-49", "50-54", "55-59", "60-64", "65-69", ">70"
    )
  )

6.2 Outcome

# Box plot of the bbox value by gender; the y axis is limited to 0-15000
bboxs_df_combined %>%
  select(gender, age, bboxs) %>%
  ggplot(aes(x = gender, y = bboxs, fill = gender)) +
  geom_boxplot() +
  scale_y_continuous(
    limits = c(0, 15000),
    breaks = seq(0, 15000, by = 1000)
  ) +
  xlab("") +
  ylab("Spatial Extent of the Users' Main Activity Area (km)") +
  theme_bw() +
  theme(legend.position = "bottom")
## Warning: Removed 24 rows containing non-finite values (stat_boxplot).

# Lollipop chart of the mean bbox value per gender and age group, ordered by
# that mean and labelled with the group size
bboxs_df_combined %>%
  select(gender, age, bboxs) %>%
  group_by(gender, age) %>%
  summarise(
    mean_bboxs = mean(bboxs),
    observations = paste0("n= ", n())
  ) %>%
  mutate(name = paste(gender, age, sep = "  ")) %>%
  ggplot(aes(x = reorder(name, mean_bboxs), y = mean_bboxs)) +
  geom_segment(aes(xend = name, yend = 0)) +
  geom_point(aes(group = observations, color = gender), size = 4) +
  directlabels::geom_dl(
    aes(label = observations),
    method = list(
      directlabels::dl.trans(x = x + 0.3),
      "last.points",
      cex = 0.8
    )
  ) +
  coord_flip() +
  scale_y_continuous(limits = c(0, 25000)) +
  ylab("Spatial Extent of the Users' Main Activity Area (km)") +
  theme_bw() +
  xlab("") +
  theme(legend.position = "bottom")
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.

7 Mapping Countries

# Raw `countries.countries` string of every user retained in `usernames_df`
countries <- osm_json %>%
  as_tbl_json(drop.nulljson = TRUE) %>%
  enter_object("countries") %>%
  spread_values(number = jstring(countries)) %>%
  filter(document.id %in% usernames_df$document.id) %>%
  as.data.frame() %>%
  pull(number)


# Count the ";" characters in each string and use that as the country count
# NOTE(review): presumably every entry ends with ";" - confirm against the data
countries_df <- data.frame(
  No_of_Country = str_count(countries, pattern = ";")
)

# Attach the count to the user table by row position and put the age groups
# in ascending order
countries_df_combined <- bind_cols(usernames_df, countries_df) %>%
  mutate(
    age = fct_relevel(
      age,
      "18-24", "25-29", "30-34", "35-39", "40-44",
      "45-49", "50-54", "55-59", "60-64", "65-69", ">70"
    )
  )
# Violin plot with a narrow box plot overlaid: country count by gender
countries_df_combined %>%
  select(gender, age, No_of_Country) %>%
  ggplot(aes(x = gender, y = No_of_Country, fill = gender)) +
  geom_violin(width = 1) +
  geom_boxplot(width = 0.1, alpha = 0.2) +
  scale_y_continuous(
    limits = c(0, 300),
    breaks = seq(0, 300, by = 50)
  ) +
  xlab("") +
  ylab("Number of Countries Contributed") +
  theme_bw() +
  theme(legend.position = "bottom")

# Box plot of the country count by age group, filled by gender
countries_df_combined %>%
  select(gender, age, No_of_Country) %>%
  ggplot(aes(x = age, y = No_of_Country, fill = gender)) +
  geom_boxplot() +
  scale_y_continuous(
    limits = c(0, 250),
    breaks = seq(0, 250, by = 50)
  ) +
  xlab("") +
  ylab("Number of Countries Contributed") +
  theme_bw() +
  theme(legend.position = "bottom")