filenames <- list.files("Contributors/", pattern="*.json", full.names=TRUE)
# Import data
osm_json <-filenames %>%
map(read_json) %>%
reduce(bind_rows) %>%
mutate(document.id = row_number())
# Take a look at the data
osm_json %>%
spread_all() %>%
as_data_frame.tbl_json() %>%
mutate(contributor.since = as.Date(contributor.since)) -> osm_tibble
class(osm_tibble)
## [1] "tbl_df" "tbl" "data.frame"
##
duration <- as.numeric(osm_tibble$changesets.changes)
summary(duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 71 60362 230727 1178502 1022159 55758489
quantile(duration, c(.01, .05, .32, .57, .98))
## 1% 5% 32% 57% 98%
## 326.77 2299.20 93648.92 330779.04 7851756.94
# Drop users who contributed less than 2000 changesets
osm_tibble %<>%
filter(changesets.changes >= 1500)
## year
lubridate::year(osm_tibble$contributor.since) %>% hist(main = "User's First Year of Contribution")
lubridate::year(osm_tibble$contributor.since) %>% table()
## .
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
## 4 7 19 28 31 27 20 17 25 22 12 13 10
survey_original <- readxl::read_xlsx("OSM survey data.xlsx") %>% select(-c(`(Found) Username`, `6.a. If you selected Other, please specify:`))
survey_original %>%
rename(username = `1. What is your OpenStreetMap Username?`,
gender = `2. What gender do you identify as?`,
age = `3. What is your age?`,
country_residence = `4. What is your country of residence?`,
nationality = `5. What is your nationality?`,
education = `6. What is your highest level of education?`,
continent = Continent,
tz = `timezone(UTC)`
) -> survey
# Create a dataframe that contains all users names
data.frame(filenames) -> usernames_df
osm_tibble %>%
select(document.id, contributor.name) %>%
rename(username = contributor.name) -> osm_filtered
usernames_df %<>%
rename(username = filenames) %>%
mutate(username = gsub("Contributors//","", .$username)) %>%
mutate(username = gsub(".json","", .$username)) %>%
left_join(survey, by = "username") %>%
left_join(osm_filtered, by = "username") %>%
drop_na(document.id, gender) %>%
filter(gender != "Prefer not to say") %>%
as_tibble()
osm_json %>%
as_tbl_json(drop.nulljson = T) %>%
enter_object('changesets') %>%
spread_values(days = jstring(days)) %>%
filter(document.id %in% usernames_df$document.id) %>%
as.data.frame() %>%
pull(2)-> days
data.frame(days) %>%
mutate(days = gsub("\\|$","", days)) %>%
separate_rows(days, sep = "[|]") %>%
separate(days, c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"), ",") %>%
mutate(across(where(is.character), as.numeric)) -> days_df
days_df %>% colMeans() %>% round(.,0)
## Sun Mon Tue Wed Thu Fri Sat
## 1095 1017 941 948 942 963 1039
usernames_df %>%
bind_cols(days_df) %>%
mutate(age = fct_relevel(age, "18-24", "25-29", "30-34", "35-39", "40-44",
"45-49", "50-54", "55-59", "60-64", "65-69", ">70")) %>%
as_tibble() -> days_df_combined
days_df_combined %>%
group_by(gender) %>%
summarise(
count = n(),
median = median(Sun, na.rm = TRUE),
IQR = IQR(Sun, na.rm = TRUE)
)
## # A tibble: 2 x 4
## gender count median IQR
## <chr> <int> <dbl> <dbl>
## 1 Female 30 26.5 222.
## 2 Male 188 540. 1366.
days_df_combined %<>%
mutate(age_combined = case_when(age == "18-24" ~ "und40",
age == "25-29" ~ "und40",
age == "30-34" ~ "und40",
age == "35-39" ~ "ov40",
age == "40-44" ~ "ov40",
age == "45-49" ~ "ov40",
age == "50-54" ~ "ov40",
age == "55-59" ~ "ov40",
age == "60-64" ~ "ov40",
age == "65-69" ~ "ov40",
age == ">70" ~ "ov40"
))
There are several methods for normality test such as Kolmogorov-Smirnov (K-S) normality test and Shapiro-Wilk’s test.
Shapiro-Wilk’s method is widely recommended for normality test and it provides better power than K-S. It is based on the correlation between the data and the corresponding normal scores. “sthda webpage”
days_df_combined %>%
select(gender, Sun:Sat) %>%
group_by(gender) %>%
rowwise() %>%
mutate(weekday = sum(c_across(Mon:Fri)),
weekend = sum(Sun, Sat),
all = sum(c_across(Sun:Sat))) -> normality_days_df
normality_days_df %>%
group_by(gender) %>%
shapiro_test(weekday, weekend, all)
## # A tibble: 6 x 4
## gender variable statistic p
## <chr> <chr> <dbl> <dbl>
## 1 Female all 0.571 3.26e- 8
## 2 Female weekday 0.599 7.17e- 8
## 3 Female weekend 0.503 5.70e- 9
## 4 Male all 0.656 2.60e-19
## 5 Male weekday 0.652 2.05e-19
## 6 Male weekend 0.652 2.01e-19
From the output, the p-value < 0.05 implying that the distribution of the data are significantly different from normal distribution. In other words, we can not assume the normality from the data. Thus we need to move on to a non-parametric metric that would statistically compare the differences between groups.
Here are the visual evidence.
library(ggpubr)
# Density plot
#ggdensity(log(normality_days_df$all), fill = "lightgray")
ggdensity(normality_days_df$weekday, fill = "lightgray")
ggdensity(normality_days_df$weekend, fill = "lightgray")
# QQ plot
#ggqqplot(log(normality_days_df$all))
ggqqplot(normality_days_df$all)
# Plot weight by group and color by group
library(ggpubr)
ggboxplot(normality_days_df, x = "gender", y = "weekday",
color = "gender", palette = c("#00AFBB", "#DC143C"),
ylab = "Weight", xlab = "Groups")
normality_days_df %>%
ggplot(aes(x=log(weekday), fill=gender)) +
geom_histogram(position = "identity", alpha = 0.8, bins = 30, colour = "grey40") +
theme_void() +
labs(fill="")
normality_days_df %>%
ggplot(aes(x=log(weekend), fill=gender)) +
geom_histogram(position = "identity", alpha = 0.8, bins = 30, colour = "grey40") +
theme_void() +
labs(fill="")
## Warning: Removed 4 rows containing non-finite values (stat_bin).