Right mouse click and go to line 89 of the .json file.
# Collect the path of every contributor JSON file.
# `pattern` is a regular expression (not a glob), so the extension is matched
# literally and anchored at the end of the file name.
# The vector must be named `filenames`: the import pipeline below and the
# username table built later both refer to it by that name.
filenames <- list.files(
  "Contributors/",
  pattern = "\\.json$",
  full.names = TRUE
)
# Import data
# Read each file, stack the results in file order, and number the documents
# so later steps can refer to a contributor through `document.id`.
osm_json <- filenames %>%
  map(read_json) %>%
  reduce(bind_rows) %>%
  mutate(document.id = row_number())
All of the variables can be listed as follows.
# Take a look at the data
# Spread every scalar field into its own column and list the column names.
colnames(spread_all(osm_json))
## [1] "document.id"
## [2] "publicprofile"
## [3] "contributor.name"
## [4] "contributor.uid"
## [5] "contributor.since"
## [6] "contributor.traces"
## [7] "contributor.blocks"
## [8] "contributor.blocks_active"
## [9] "contributor.img"
## [10] "contributor.roles"
## [11] "contributor.blocks_issued"
## [12] "contributor.blocks_active_issued"
## [13] "contributor.organised_editing"
## [14] "node.f_id"
## [15] "node.f_tstamp"
## [16] "node.f_lon"
## [17] "node.f_lat"
## [18] "node.l_id"
## [19] "node.l_tstamp"
## [20] "node.l_lon"
## [21] "node.l_lat"
## [22] "nodes.c"
## [23] "nodes.m"
## [24] "nodes.d"
## [25] "nodes.lm"
## [26] "ways.c"
## [27] "ways.m"
## [28] "ways.d"
## [29] "ways.lm"
## [30] "waytags.c_addr"
## [31] "waytags.m_addr"
## [32] "waytags.c_building"
## [33] "waytags.m_building"
## [34] "waytags.c_highway"
## [35] "waytags.m_highway"
## [36] "waytags.c_landuse"
## [37] "waytags.m_landuse"
## [38] "waytags.c_name"
## [39] "waytags.m_name"
## [40] "waytags.c_natural"
## [41] "waytags.m_natural"
## [42] "waytags.c_waterway"
## [43] "waytags.m_waterway"
## [44] "relations.c"
## [45] "relations.m"
## [46] "relations.d"
## [47] "relations.lm"
## [48] "changesets.no"
## [49] "changesets.changes"
## [50] "changesets.max_gap_days"
## [51] "changesets.mapping_days_year"
## [52] "changesets.info"
## [53] "changesets.editors"
## [54] "changesets.bboxs"
## [55] "changesets.days"
## [56] "changesets.hours"
## [57] "changesets.mapping_days"
## [58] "changesets.comment_wordcount"
## [59] "changesets.hashtag_wordcount"
## [60] "countries.countries"
## [61] "notes.op"
## [62] "notes.co"
## [63] "notes.cl"
## [64] "notes.clco"
## [65] "discussion.dis_changesets"
## [66] "discussion.com_changesets"
## [67] "discussion.num_comments"
## [68] "discussion.dis_responses"
## [69] "changesets."
## [70] "changesets.review_requested"
## [71] "lastmodifier.node_tags.ref"
## [72] "lastmodifier.node_tags.name"
## [73] "lastmodifier.node_tags.shop"
## [74] "lastmodifier.node_tags.addr:"
## [75] "lastmodifier.node_tags.name:"
## [76] "lastmodifier.node_tags.place"
## [77] "lastmodifier.node_tags.power"
## [78] "lastmodifier.node_tags.sport"
## [79] "lastmodifier.node_tags.amenity"
## [80] "lastmodifier.node_tags.barrier"
## [81] "lastmodifier.node_tags.highway"
## [82] "lastmodifier.node_tags.leisure"
## [83] "lastmodifier.node_tags.natural"
## [84] "lastmodifier.node_tags.railway"
## [85] "lastmodifier.node_tags.surface"
## [86] "lastmodifier.node_tags.tourism"
## [87] "lastmodifier.node_tags.building"
## [88] "lastmodifier.node_tags.historic"
## [89] "lastmodifier.node_tags.man_made"
## [90] "lastmodifier.node_tags.waterway"
## [91] "lastmodifier.node_tags.building:"
## [92] "lastmodifier.node_tags.emergency"
## [93] "lastmodifier.node_tags.public_transport"
## [94] "lastmodifier.way_tags.ref"
## [95] "lastmodifier.way_tags.name"
## [96] "lastmodifier.way_tags.shop"
## [97] "lastmodifier.way_tags.addr:"
## [98] "lastmodifier.way_tags.name:"
## [99] "lastmodifier.way_tags.place"
## [100] "lastmodifier.way_tags.power"
## [101] "lastmodifier.way_tags.sport"
## [102] "lastmodifier.way_tags.amenity"
## [103] "lastmodifier.way_tags.barrier"
## [104] "lastmodifier.way_tags.highway"
## [105] "lastmodifier.way_tags.landuse"
## [106] "lastmodifier.way_tags.leisure"
## [107] "lastmodifier.way_tags.natural"
## [108] "lastmodifier.way_tags.railway"
## [109] "lastmodifier.way_tags.surface"
## [110] "lastmodifier.way_tags.tourism"
## [111] "lastmodifier.way_tags.boundary"
## [112] "lastmodifier.way_tags.building"
## [113] "lastmodifier.way_tags.historic"
## [114] "lastmodifier.way_tags.man_made"
## [115] "lastmodifier.way_tags.waterway"
## [116] "lastmodifier.way_tags.building:"
## [117] "lastmodifier.way_tags.emergency"
## [118] "lastmodifier.way_tags.public_transport"
## [119] "lastmodifier.relation_tags.ref"
## [120] "lastmodifier.relation_tags.name"
## [121] "lastmodifier.relation_tags.shop"
## [122] "lastmodifier.relation_tags.addr:"
## [123] "lastmodifier.relation_tags.name:"
## [124] "lastmodifier.relation_tags.route"
## [125] "lastmodifier.relation_tags.amenity"
## [126] "lastmodifier.relation_tags.highway"
## [127] "lastmodifier.relation_tags.landuse"
## [128] "lastmodifier.relation_tags.natural"
## [129] "lastmodifier.relation_tags.surface"
## [130] "lastmodifier.relation_tags.tourism"
## [131] "lastmodifier.relation_tags.building"
## [132] "lastmodifier.relation_tags.historic"
## [133] "lastmodifier.relation_tags.building:"
## [134] "lastmodifier.relation_tags.restriction"
## [135] "lastmodifier.relation_tags.public_transport"
## [136] "accounts.OSM Wiki.id"
## [137] "accounts.OSM Wiki.Editcount"
## [138] "accounts.OSM Help.id"
## [139] "accounts.OSM Help.Reputation"
## [140] "accounts.OSM Forum.id"
## [141] "accounts.OSM Forum.Posts"
## [142] "accounts.Mapillary.id"
## [143] "accounts.Mapillary.Total Photos"
## [144] "accounts.Mapillary.Total Distance"
## [145] "lastmodifier.way_tags.route"
## [146] "lastmodifier.relation_tags.boundary"
## [147] "lastmodifier.relation_tags.waterway"
## [148] "lastmodifier.relation_tags.emergency"
## [149] "lastmodifier.node_tags.landuse"
## [150] "lastmodifier.relation_tags.place"
## [151] "lastmodifier.relation_tags.power"
## [152] "lastmodifier.relation_tags.sport"
## [153] "lastmodifier.relation_tags.leisure"
## [154] "lastmodifier.relation_tags.man_made"
## [155] "lastmodifier.relation_tags.addr"
## [156] "lastmodifier.relation_tags.barrier"
## [157] "accounts.OpenStreetCam.id"
## [158] "accounts.OpenStreetCam.Total Photos"
## [159] "accounts.OpenStreetCam.Total Distance"
## [160] "accounts.Github.id"
## [161] "accounts.Github.Contributions"
## [162] "lastmodifier.node_tags.boundary"
## [163] "lastmodifier.relation_tags.railway"
## [164] "lastmodifier.way_tags.restriction"
## [165] "lastmodifier.node_tags.restriction"
## [166] "lastmodifier.node_tags.route"
## [167] "accounts.OSM Streak.id"
## [168] "accounts.OSM Streak.Points"
## [169] "lastmodifier.node_tags.addr"
## [170] "lastmodifier.way_tags.addr"
## [171] "..JSON"
The code below simply lists the top-level nodes, which helps in understanding the overarching structure.
# browse Types
# Gather the top-level keys of each document, tag each value with its JSON
# type, and print the complete (name, type) tally without row truncation.
print(
  count(json_types(gather_object(osm_json)), name, type),
  n = Inf
)
## # A tibble: 18 x 3
## name type n
## <chr> <fct> <int>
## 1 accounts object 69
## 2 accounts array 215
## 3 calendar array 284
## 4 changesets object 284
## 5 contributor object 284
## 6 countries object 284
## 7 discussion object 284
## 8 lastmodifier object 284
## 9 node object 284
## 10 nodes object 284
## 11 notes object 284
## 12 publicprofile string 284
## 13 qa array 284
## 14 ranks array 284
## 15 recent array 284
## 16 relations object 284
## 17 ways object 284
## 18 waytags object 284
Using `enter_object()`, `gather_object()`, and `spread_values()`, you can create and customise `data.frame`s.
# Glance at the first index
# Step into the `contributor` object and pull four of its fields out as
# character columns.
spread_values(
  enter_object(osm_json, "contributor"),
  name = jstring(name),
  uid = jstring(uid),
  traces = jstring(traces),
  blocks = jstring(blocks)
)
## # A tbl_json: 284 x 6 tibble with a "JSON" attribute
## ..JSON document.id name uid traces blocks
## <chr> <int> <chr> <chr> <chr> <chr>
## 1 "{\"name\":\"!i!\",\"..." 1 !i! 25720 55 0
## 2 "{\"name\":\"<0174\"..." 2 <0174 172147 204 0
## 3 "{\"name\":\"24dake..." 3 24dakenlo 3354215 0 0
## 4 "{\"name\":\"ACM\",\"..." 4 ACM 20372 0 0
## 5 "{\"name\":\"ACS198..." 5 ACS1986 2018957 54 0
## 6 "{\"name\":\"Adam F..." 6 Adam Franco 27832 36 0
## 7 "{\"name\":\"AkuAna..." 7 AkuAnakTimur 1407839 1388 0
## 8 "{\"name\":\"alan_g..." 8 alan_gr 5307206 3 0
## 9 "{\"name\":\"alarob..." 9 alarobric 525369 22 0
## 10 "{\"name\":\"Alaska..." 10 AlaskaDave 473104 881 0
## # … with 274 more rows
To avoid verbose coding, we can transform the entire JSON structure into a `data.frame` using the code below.
# Now convert them to tibble
# Spread all scalar fields, drop the JSON attribute, and parse the sign-up
# date so it can be used with date helpers later on.
osm_tibble <- osm_json %>%
  spread_all() %>%
  as_data_frame.tbl_json() %>%
  mutate(contributor.since = as.Date(contributor.since))
class(osm_tibble)
## [1] "tbl_df" "tbl" "data.frame"
##
# Distribution of the per-user `changesets.changes` value, coerced to numeric.
duration <- as.numeric(osm_tibble[["changesets.changes"]])
summary(duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 71 60362 230727 1178502 1022159 55758489
# Selected percentiles of the same distribution.
quantile(duration, probs = c(0.01, 0.05, 0.32, 0.57, 0.98))
## 1% 5% 32% 57% 98%
## 326.77 2299.20 93648.92 330779.04 7851756.94
# Drop low-activity users: keep contributors with at least 1500 changes.
# The column is coerced with as.numeric() before comparing, so the test is
# numeric even if the values were imported as text.
# NOTE(review): the earlier comment said 2000 while the code used 1500; the
# threshold in the code is kept here - confirm which value is intended.
osm_tibble <- osm_tibble %>%
  filter(as.numeric(changesets.changes) >= 1500)
## year
# Histogram of the year in which each remaining user signed up.
osm_tibble$contributor.since %>%
  lubridate::year() %>%
  hist(main = "User's First Year of Contribution")
# The same distribution as a frequency table.
osm_tibble$contributor.since %>%
  lubridate::year() %>%
  table()
## .
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
## 4 7 19 28 31 27 20 17 25 22 12 13 10
# import Excel
# Read the survey workbook and drop two columns by name.
survey_original <- readxl::read_xlsx("OSM survey data.xlsx") %>%
  select(
    -`(Found) Username`,
    -`6.a. If you selected Other, please specify:`
  )
# Replace the questionnaire headings with short, code-friendly column names.
survey <- rename(
  survey_original,
  username = `1. What is your OpenStreetMap Username?`,
  gender = `2. What gender do you identify as?`,
  age = `3. What is your age?`,
  country_residence = `4. What is your country of residence?`,
  nationality = `5. What is your nationality?`,
  education = `6. What is your highest level of education?`,
  continent = Continent,
  tz = `timezone(UTC)`
)
# Create a dataframe that contains all users names
usernames_df <- data.frame(filenames)
# document.id / username lookup for the contributors kept in `osm_tibble`.
osm_filtered <- osm_tibble %>%
  select(document.id, contributor.name) %>%
  rename(username = contributor.name)
# Derive the username from each file path, then attach the survey answers and
# the document id. Only users present in both sources with a stated gender
# are kept.
usernames_df <- usernames_df %>%
  rename(username = filenames) %>%
  # basename() removes the directory part. The extension is stripped with an
  # escaped, end-anchored pattern so that only a trailing literal ".json" is
  # removed (an unescaped "." would match any character anywhere in the name).
  mutate(
    username = sub("\\.json$", "", basename(as.character(username)))
  ) %>%
  left_join(survey, by = "username") %>%
  left_join(osm_filtered, by = "username") %>%
  drop_na(document.id, gender) %>%
  filter(gender != "Prefer not to say") %>%
  as_tibble()
# Cross-tabulate gender by age, then show it transposed (one row per age
# band, one column per gender) as a formatted table.
dem_table <- table(usernames_df$gender, usernames_df$age)
knitr::kable(as.data.frame.matrix(t(dem_table)))
| | Female | Male |
|---|---|---|
| >70 | 1 | 5 |
| 18-24 | 5 | 18 |
| 25-29 | 9 | 29 |
| 30-34 | 4 | 33 |
| 35-39 | 6 | 25 |
| 40-44 | 2 | 23 |
| 45-49 | 2 | 19 |
| 50-54 | 1 | 15 |
| 55-59 | 0 | 11 |
| 60-64 | 0 | 6 |
| 65-69 | 0 | 4 |
# Number of respondents per gender and each gender's share of the total.
usernames_df %>%
  count(gender) %>%
  mutate(freq = n / sum(n))
## # A tibble: 2 x 3
## gender n freq
## <chr> <int> <dbl>
## 1 Female 30 0.138
## 2 Male 188 0.862
# Respondents per gender and age band. The result stays grouped by gender
# after summarise(), so `freq` is the share within each gender.
usernames_df %>%
  group_by(gender, age) %>%
  summarise(n = dplyr::n()) %>%
  mutate(freq = n / sum(n))
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
## # A tibble: 19 x 4
## # Groups: gender [2]
## gender age n freq
## <chr> <chr> <int> <dbl>
## 1 Female >70 1 0.0333
## 2 Female 18-24 5 0.167
## 3 Female 25-29 9 0.3
## 4 Female 30-34 4 0.133
## 5 Female 35-39 6 0.2
## 6 Female 40-44 2 0.0667
## 7 Female 45-49 2 0.0667
## 8 Female 50-54 1 0.0333
## 9 Male >70 5 0.0266
## 10 Male 18-24 18 0.0957
## 11 Male 25-29 29 0.154
## 12 Male 30-34 33 0.176
## 13 Male 35-39 25 0.133
## 14 Male 40-44 23 0.122
## 15 Male 45-49 19 0.101
## 16 Male 50-54 15 0.0798
## 17 Male 55-59 11 0.0585
## 18 Male 60-64 6 0.0319
## 19 Male 65-69 4 0.0213
# Frequency table of the respondents' country of residence.
usernames_df["country_residence"] %>%
  table()
## .
## Albania Australia Austria Belarus Belgium
## 1 3 7 1 4
## Brazil Canada Colombia Czech Republic Denmark
## 2 7 2 3 1
## Ethiopia France Germany Hong Kong India
## 1 13 50 1 1
## Indonesia Italy Japan Lithuania Malaysia
## 1 3 3 1 1
## Mexico Netherlands Nicaragua Norway Philippines
## 1 8 1 1 6
## Poland Portugal Russia Slovakia Spain
## 3 1 6 1 1
## Sweden Switzerland Taiwan Uganda UK
## 1 1 1 1 37
## Ukraine USA
## 1 41
# Frequency table of the respondents' continent.
usernames_df["continent"] %>%
  table()
## .
## Africa Asia C.America Europe N.America Oceania S.America
## 2 14 2 145 48 3 4
# Pull the raw `changesets.days` string of every surveyed user.
days <- osm_json %>%
  as_tbl_json(drop.nulljson = TRUE) %>%
  enter_object("changesets") %>%
  spread_values(days = jstring(days)) %>%
  filter(document.id %in% usernames_df$document.id) %>%
  as.data.frame() %>%
  pull(2)
# Strip the trailing "|", split on "|" and then on ",", and convert the seven
# weekday fields (Sunday first) to numbers.
days_df <- data.frame(days) %>%
  mutate(days = gsub("\\|$", "", days)) %>%
  separate_rows(days, sep = "[|]") %>%
  separate(
    days,
    into = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"),
    sep = ","
  ) %>%
  mutate(across(where(is.character), as.numeric))
# Mean count per weekday over all users, rounded to whole numbers.
round(colMeans(days_df), 0)
## Sun Mon Tue Wed Thu Fri Sat
## 1095 1017 941 948 942 963 1039
# Attach the weekday counts to the user table column-wise (rows are matched
# by position) and put the age bands in ascending order.
days_df_combined <- usernames_df %>%
  bind_cols(days_df) %>%
  mutate(
    age = fct_relevel(
      age,
      "18-24", "25-29", "30-34", "35-39", "40-44",
      "45-49", "50-54", "55-59", "60-64", "65-69", ">70"
    )
  ) %>%
  as_tibble()
# Express each user's weekday counts as a share of that user's weekly total,
# rounded to three decimals.
days_df_per <- days_df_combined %>%
  select(gender, age, Sun:Sat) %>%
  group_by(gender, age) %>%
  rowwise() %>%
  mutate(weeksum = sum(c_across(Sun:Sat))) %>%
  ungroup() %>%
  mutate(across(Sun:Sat, ~ .x / weeksum)) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  select(-weeksum)
Weekly Contribution (Averaged)
# Stack the seven weekday columns and average all values within each gender.
days_df_combined %>%
  select(gender, Sun:Sat) %>%
  pivot_longer(cols = !gender, names_to = "days", values_to = "count") %>%
  group_by(gender) %>%
  summarise(mean_cont_per_week = mean(count))
## # A tibble: 2 x 2
## gender mean_cont_per_week
## <chr> <dbl>
## 1 Female 544.
## 2 Male 1064.
# Box plot, by gender, of each user's total over the seven weekday columns.
days_df_combined %>%
  filter(gender != "Prefer not to say") %>%
  select(gender, age, Sun:Sat) %>%
  rowwise() %>%
  mutate(
    mean_cont_per_week = sum(c_across(where(is.numeric)), na.rm = TRUE)
  ) %>%
  select(gender, age, mean_cont_per_week) %>%
  ggplot(aes(x = gender, y = mean_cont_per_week, fill = gender)) +
  geom_boxplot() +
  ylim(0, 35000) +
  labs(
    x = "",
    y = "Averaged Number of 'Days Active'"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
Gender Comparison
# Mean count per weekday for each gender, drawn as one line per gender.
# melt() keeps the weekday columns in their original order on the x axis.
days_df_combined %>%
  select(gender, age, Sun:Sat) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  labs(
    x = "",
    y = "Averaged Number of 'Days Active'"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
# Mean count per weekday for each gender, one panel per age band with its own
# y scale.
days_df_combined %>%
  select(gender, age, Sun:Sat) %>%
  group_by(gender, age) %>%
  summarise(
    across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop_last"
  ) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  facet_wrap(~age, scales = "free_y") +
  labs(
    x = "",
    y = "Average Contributions During the Week"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(-20, 0, 0, 0),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
# Per-gender mean of every numeric column, leaving out the id and time-zone
# columns so that only the seven weekdays remain.
days_df_combined %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  select(-document.id, -tz)
## # A tibble: 2 x 8
## gender Sun Mon Tue Wed Thu Fri Sat
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female 445. 594. 578. 580. 600. 557. 452.
## 2 Male 1199. 1084. 999. 1007. 997. 1028. 1133.
# Box plot of each user's total over the weekday columns, split by gender on
# the x axis and coloured by age band.
days_df_combined %>%
  filter(gender != "Prefer not to say") %>%
  select(gender, age, Sun:Sat) %>%
  rowwise() %>%
  mutate(
    mean_cont_per_week = sum(c_across(where(is.numeric)), na.rm = TRUE)
  ) %>%
  select(gender, age, mean_cont_per_week) %>%
  ggplot(aes(x = gender, y = mean_cont_per_week, fill = age)) +
  geom_boxplot() +
  ylim(0, 35000) +
  labs(
    x = "",
    y = "Days Active"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
Gender Comparison by Age group
Weekly Contribution based on ratio
# Mean weekday share per gender, rounded to three decimals.
days_df_per %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3)))
## # A tibble: 2 x 8
## gender Sun Mon Tue Wed Thu Fri Sat
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Female 0.091 0.143 0.152 0.163 0.17 0.183 0.098
## 2 Male 0.155 0.146 0.14 0.14 0.135 0.142 0.142
# Line chart of the mean weekday share, one line per gender.
days_df_per %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  labs(
    x = "",
    y = "Days Active (%)"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    text = element_text(size = 15)
  )
# Mean weekday share per gender, one panel per age band. The dashed line is
# drawn at 0.142 as a fixed reference level.
days_df_per %>%
  group_by(gender, age) %>%
  summarise(
    across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop_last"
  ) %>%
  mutate_if(is.numeric, round, digits = 3) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  geom_hline(yintercept = .142, linetype = "dashed") +
  facet_wrap(~age, scales = "free_y") +
  labs(
    x = "",
    y = "Days Active (%)"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.margin = margin(0, 0, 0, 0),
    legend.box.margin = margin(-20, 0, 0, 0),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
## `mutate_if()` ignored the following grouping variables:
## Column `gender`
# Pull the raw `changesets.hours` string of every surveyed user.
hours <- osm_json %>%
  as_tbl_json(drop.nulljson = TRUE) %>%
  enter_object("changesets") %>%
  spread_values(hours = jstring(hours)) %>%
  filter(document.id %in% usernames_df$document.id) %>%
  as.data.frame() %>%
  pull(2)
# Strip the trailing "|", split on "|" and then on ",", and convert the 24
# hourly fields (named h00 ... h23) to numbers.
hours_df <- data.frame(hours) %>%
  mutate(hours = gsub("\\|$", "", hours)) %>%
  separate_rows(hours, sep = "[|]") %>%
  separate(hours, into = sprintf("h%02d", 0:23), sep = ",") %>%
  mutate(across(where(is.character), as.numeric))
# Attach the hourly counts to the user table column-wise (rows are matched by
# position) and put the age bands in ascending order.
hours_df_combined <- usernames_df %>%
  bind_cols(hours_df) %>%
  mutate(
    age = fct_relevel(
      age,
      "18-24", "25-29", "30-34", "35-39", "40-44",
      "45-49", "50-54", "55-59", "60-64", "65-69", ">70"
    )
  )
# Re-label the hourly counts from UTC to each user's local time.
# 1. Stack the 24 hourly columns into (UTC hour, Count) rows.
# 2. Add the user's `tz` offset and wrap the result onto 0-23 with modulo.
# 3. Rebuild zero-padded "hNN" labels and spread back to one column per local
#    hour; arranging by label first keeps the columns in h00 ... h23 order.
# 4. Re-attach the user columns (username through document.id).
# The right-hand side is evaluated in full before the assignment, so the
# inner reference to `hours_df_combined` still sees the UTC-based table.
# NOTE(review): assumes `tz` holds whole-hour offsets - confirm for users in
# half-hour time zones.
hours_df_combined <- hours_df_combined %>%
  select(document.id, tz, h00:h23) %>%
  pivot_longer(
    !c(document.id, tz),
    names_to = "UTC",
    values_to = "Count"
  ) %>%
  mutate(
    UTC = as.numeric(str_sub(UTC, -2)),
    local = (tz + UTC) %% 24,
    local = str_c("h", str_pad(as.character(local), width = 2, pad = "0"))
  ) %>%
  select(document.id, Count, local) %>%
  arrange(document.id, local, Count) %>%
  pivot_wider(names_from = local, values_from = Count) %>%
  left_join(
    hours_df_combined %>% select(username:document.id),
    by = "document.id"
  )
# Express each user's hourly counts as a share of that user's total over the
# 24 columns, rounded to three decimals.
hours_df_per <- hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  group_by(gender, age) %>%
  rowwise() %>%
  mutate(dailysum = sum(c_across(h00:h23))) %>%
  ungroup() %>%
  mutate(across(h00:h23, ~ .x / dailysum)) %>%
  mutate(across(where(is.numeric), ~ round(.x, digits = 3))) %>%
  select(-dailysum)
# Stack the 24 hourly columns and average all values within each gender.
hours_df_combined %>%
  select(gender, h00:h23) %>%
  pivot_longer(cols = !gender, names_to = "Hours", values_to = "count") %>%
  group_by(gender) %>%
  summarise(mean = mean(count))
## # A tibble: 2 x 2
## gender mean
## <chr> <dbl>
## 1 Female 159.
## 2 Male 310.
# Mean count for every (gender, hour) pair, shown with one row per hour and
# one column per gender; all 24 rows are printed.
hours_df_combined %>%
  select(gender, h00:h23) %>%
  pivot_longer(cols = !gender, names_to = "Hours", values_to = "count") %>%
  group_by(gender, Hours) %>%
  summarise(mean = mean(count)) %>%
  pivot_wider(names_from = gender, values_from = mean) %>%
  print(n = Inf)
## # A tibble: 24 x 3
## Hours Female Male
## <chr> <dbl> <dbl>
## 1 h00 45.8 229.
## 2 h01 20.5 154.
## 3 h02 14.6 95.8
## 4 h03 35.0 61.2
## 5 h04 102. 59.9
## 6 h05 167. 80.5
## 7 h06 120. 150.
## 8 h07 72.6 227.
## 9 h08 112. 282.
## 10 h09 197. 326.
## 11 h10 264. 372.
## 12 h11 270. 395.
## 13 h12 210. 395.
## 14 h13 246. 415.
## 15 h14 260. 439.
## 16 h15 258. 421.
## 17 h16 238. 431.
## 18 h17 216. 441.
## 19 h18 186. 461.
## 20 h19 173. 449.
## 21 h20 181. 432.
## 22 h21 183. 418.
## 23 h22 143. 389.
## 24 h23 89.8 325.
# Mean count per local hour for each gender, drawn as one line per gender.
hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  labs(
    x = "",
    y = "Days Active"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
UK
# Same hourly gender comparison, restricted to respondents living in the UK.
hours_df_combined %>%
  filter(country_residence == "UK") %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  labs(
    x = "",
    y = "Days Active"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
USA
# Same hourly gender comparison, restricted to respondents living in the USA.
hours_df_combined %>%
  filter(country_residence == "USA") %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  labs(
    x = "",
    y = "Days Active"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
Germany
# Same hourly gender comparison, restricted to respondents living in Germany.
hours_df_combined %>%
  filter(country_residence == "Germany") %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  labs(
    x = "",
    y = "Days Active"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
# Mean count per local hour for each age band, one line per band.
hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  group_by(age) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "age",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = age, colour = age)) +
  geom_point() +
  geom_line() +
  labs(
    x = "",
    y = "Days Active"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
# Mean count per local hour for each age band, one row of panels per gender.
hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  filter(age != ">70") %>% # <- remove >70 aged group
  group_by(age, gender) %>%
  summarise(
    across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop_last"
  ) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = age, colour = age)) +
  geom_point() +
  geom_line() +
  facet_grid(rows = vars(gender)) +
  labs(
    x = "",
    y = "Days Active"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
# Mean count per local hour for each gender, one panel per age band (three
# panels per row, each with its own y scale).
hours_df_combined %>%
  select(gender, age, h00:h23) %>%
  group_by(age, gender) %>%
  summarise(
    across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop_last"
  ) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  facet_wrap(~age, scales = "free_y", ncol = 3) +
  labs(
    x = "",
    y = "Days Active"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
# Mean hourly share for each gender, drawn as one line per gender.
hours_df_per %>%
  select(gender, age, h00:h23) %>%
  group_by(gender) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  reshape2::melt(
    id.vars = "gender",
    variable.name = "Hours",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Hours, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  labs(
    x = "",
    y = "Days Active(%)"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
# Mean hourly share for each gender, one panel per age band (three panels per
# row, each with its own y scale).
hours_df_per %>%
  select(gender, age, h00:h23) %>%
  group_by(age, gender) %>%
  summarise(
    across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop_last"
  ) %>%
  reshape2::melt(
    id.vars = c("gender", "age"),
    variable.name = "Days",
    value.name = "Value"
  ) %>%
  ggplot(aes(x = Days, y = Value, group = gender, colour = gender)) +
  geom_point() +
  geom_line() +
  facet_wrap(~age, scales = "free_y", ncol = 3) +
  labs(
    x = "",
    y = "Days Active(%)"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  )
# Pull the raw `changesets.bboxs` string of every surveyed user.
bboxs <- osm_json %>%
  as_tbl_json(drop.nulljson = TRUE) %>%
  enter_object("changesets") %>%
  spread_values(number = jstring(bboxs)) %>%
  filter(document.id %in% usernames_df$document.id) %>%
  as.data.frame() %>%
  pull(2)
# Strip the trailing "|", split on ";", keep rows 1, 4, 7, ... (the first of
# every three values) and convert them to numbers.
bboxs_df <- data.frame(bboxs) %>%
  mutate(bboxs = gsub("\\|$", "", bboxs)) %>%
  separate_rows(bboxs, sep = ";") %>%
  filter(row_number() %% 3 == 1) %>%
  mutate(across(where(is.character), as.numeric))
# Attach the value to the user table column-wise (rows are matched by
# position) and put the age bands in ascending order.
bboxs_df_combined <- usernames_df %>%
  bind_cols(bboxs_df) %>%
  drop_na(gender) %>%
  mutate(
    age = fct_relevel(
      age,
      "18-24", "25-29", "30-34", "35-39", "40-44",
      "45-49", "50-54", "55-59", "60-64", "65-69", ">70"
    )
  )
# Box plot of the bboxs value by gender; the y axis is limited to 0-15000, so
# values outside that range are dropped from the plot.
bboxs_df_combined %>%
  select(gender, age, bboxs) %>%
  ggplot(aes(x = gender, y = bboxs, fill = gender)) +
  geom_boxplot() +
  scale_y_continuous(
    limits = c(0, 15000),
    breaks = seq(0, 15000, by = 1000)
  ) +
  labs(
    x = "",
    y = "Spatial Extent of the Users' Main Activity Area (km)"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
## Warning: Removed 24 rows containing non-finite values (stat_boxplot).
# Lollipop chart of the mean bboxs value for every gender / age combination,
# ordered by the mean and labelled with the group size.
bboxs_df_combined %>%
  select(gender, age, bboxs) %>%
  group_by(gender, age) %>%
  summarise(
    mean_bboxs = mean(bboxs),
    observations = paste0("n= ", n())
  ) %>%
  mutate(name = paste(gender, age, sep = " ")) %>%
  ggplot(aes(x = reorder(name, mean_bboxs), y = mean_bboxs)) +
  geom_segment(aes(xend = name, yend = 0)) +
  geom_point(size = 4, aes(group = observations, color = gender)) +
  directlabels::geom_dl(
    aes(label = observations),
    method = list(
      directlabels::dl.trans(x = x + 0.3),
      "last.points",
      cex = 0.8
    )
  ) +
  coord_flip() +
  ylim(0, 25000) +
  labs(
    x = "",
    y = "Spatial Extent of the Users' Main Activity Area (km)"
  ) +
  theme_bw() +
  xlab("") +
  theme(legend.position = "bottom")
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
# Pull the raw `countries.countries` string of every surveyed user.
countries <- osm_json %>%
  as_tbl_json(drop.nulljson = TRUE) %>%
  enter_object("countries") %>%
  spread_values(number = jstring(countries)) %>%
  filter(document.id %in% usernames_df$document.id) %>%
  as.data.frame() %>%
  pull(2)
# Count the ";" characters in each string and use that as the country count.
countries_df <- data.frame(
  No_of_Country = str_count(countries, pattern = ";")
)
# Attach the count to the user table column-wise (rows are matched by
# position) and put the age bands in ascending order.
countries_df_combined <- usernames_df %>%
  bind_cols(countries_df) %>%
  mutate(
    age = fct_relevel(
      age,
      "18-24", "25-29", "30-34", "35-39", "40-44",
      "45-49", "50-54", "55-59", "60-64", "65-69", ">70"
    )
  )
# Violin plot with a narrow overlaid box plot of the country count by gender.
countries_df_combined %>%
  select(gender, age, No_of_Country) %>%
  ggplot(aes(x = gender, y = No_of_Country, fill = gender)) +
  geom_violin(width = 1) +
  geom_boxplot(width = 0.1, alpha = 0.2) +
  scale_y_continuous(
    limits = c(0, 300),
    breaks = seq(0, 300, by = 50)
  ) +
  labs(
    x = "",
    y = "Number of Countries Contributed"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
# Box plot of the country count per age band, with one box per gender.
countries_df_combined %>%
  select(gender, age, No_of_Country) %>%
  ggplot(aes(x = age, y = No_of_Country, fill = gender)) +
  geom_boxplot() +
  scale_y_continuous(
    limits = c(0, 250),
    breaks = seq(0, 250, by = 50)
  ) +
  labs(
    x = "",
    y = "Number of Countries Contributed"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")