filenames <- list.files("Contributors/", pattern="*.json", full.names=TRUE) 
# Import data
osm_json <-filenames %>% 
  map(read_json) %>% 
  reduce(bind_rows) %>% 
  mutate(document.id = row_number())
# Take a look at the data

osm_json %>% 
  spread_all() %>% 
  as_data_frame.tbl_json() %>% 
  mutate(contributor.since = as.Date(contributor.since)) -> osm_tibble

class(osm_tibble)
## [1] "tbl_df"     "tbl"        "data.frame"
##
duration <- as.numeric(osm_tibble$changesets.changes)
summary(duration)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##       71    60362   230727  1178502  1022159 55758489
quantile(duration, c(.01, .05, .32, .57, .98)) 
##         1%         5%        32%        57%        98% 
##     326.77    2299.20   93648.92  330779.04 7851756.94
# Drop users who contributed less than 2000 changesets
osm_tibble %<>% 
  filter(changesets.changes >= 1500)


## year
lubridate::year(osm_tibble$contributor.since) %>% hist(main = "User's First Year of Contribution")

lubridate::year(osm_tibble$contributor.since) %>% table()
## .
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 
##    4    7   19   28   31   27   20   17   25   22   12   13   10
survey_original <- readxl::read_xlsx("OSM survey data.xlsx") %>% select(-c(`(Found) Username`, `6.a. If you selected Other, please specify:`))
  
survey_original %>% 
  rename(username = `1. What is your OpenStreetMap Username?`,
         gender = `2. What gender do you identify as?`,
         age = `3. What is your age?`,
         country_residence = `4. What is your country of residence?`,
         nationality = `5. What is your nationality?`,
         education = `6. What is your highest level of education?`,
         continent = Continent,
         tz = `timezone(UTC)`
         ) -> survey

# Create a dataframe that contains all users names
data.frame(filenames) -> usernames_df

osm_tibble %>% 
  select(document.id, contributor.name) %>% 
  rename(username = contributor.name) -> osm_filtered

usernames_df %<>% 
  rename(username = filenames) %>% 
  mutate(username = gsub("Contributors//","", .$username)) %>% 
  mutate(username = gsub(".json","", .$username)) %>% 
  left_join(survey, by = "username") %>% 
  left_join(osm_filtered, by = "username") %>% 
  drop_na(document.id, gender) %>%
  filter(gender != "Prefer not to say") %>% 
  as_tibble()

1 Days

osm_json %>%
  as_tbl_json(drop.nulljson = T) %>% 
  enter_object('changesets') %>%
  spread_values(days = jstring(days)) %>% 
  filter(document.id %in% usernames_df$document.id) %>%  
  as.data.frame() %>% 
  pull(2)-> days

data.frame(days) %>% 
  mutate(days = gsub("\\|$","", days)) %>%
  separate_rows(days, sep = "[|]") %>%
  separate(days, c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"), ",") %>% 
  mutate(across(where(is.character), as.numeric)) -> days_df


days_df %>% colMeans() %>% round(.,0)
##  Sun  Mon  Tue  Wed  Thu  Fri  Sat 
## 1095 1017  941  948  942  963 1039
usernames_df %>%  
  bind_cols(days_df) %>% 
  mutate(age = fct_relevel(age, "18-24", "25-29", "30-34", "35-39", "40-44", 
                           "45-49", "50-54", "55-59", "60-64", "65-69", ">70")) %>% 
  as_tibble() -> days_df_combined

days_df_combined %>% 
  group_by(gender) %>% 
  summarise(
    count = n(),
    median = median(Sun, na.rm = TRUE),
    IQR = IQR(Sun, na.rm = TRUE)
  )
## # A tibble: 2 x 4
##   gender count median   IQR
##   <chr>  <int>  <dbl> <dbl>
## 1 Female    30   26.5  222.
## 2 Male     188  540.  1366.
days_df_combined %<>% 
  mutate(age_combined = case_when(age == "18-24" ~ "und40",
                                  age == "25-29" ~ "und40",
                                  age == "30-34" ~ "und40",
                                  age == "35-39" ~ "ov40",
                                  age == "40-44" ~ "ov40",
                                  age == "45-49" ~ "ov40",
                                  age == "50-54" ~ "ov40",
                                  age == "55-59" ~ "ov40",
                                  age == "60-64" ~ "ov40",
                                  age == "65-69" ~ "ov40",
                                  age == ">70" ~ "ov40"
                                  ))

2 Normality Test

There are several methods for normality test such as Kolmogorov-Smirnov (K-S) normality test and Shapiro-Wilk’s test.

Shapiro-Wilk’s method is widely recommended for normality test and it provides better power than K-S. It is based on the correlation between the data and the corresponding normal scores. “sthda webpage”

2.1 Shapiro Test: A way to test the normality of data

days_df_combined %>% 
  select(gender, Sun:Sat) %>% 
  group_by(gender) %>% 
  rowwise() %>% 
  mutate(weekday = sum(c_across(Mon:Fri)),
         weekend = sum(Sun, Sat),
         all = sum(c_across(Sun:Sat))) -> normality_days_df



normality_days_df %>% 
  group_by(gender) %>%
  shapiro_test(weekday, weekend, all)
## # A tibble: 6 x 4
##   gender variable statistic        p
##   <chr>  <chr>        <dbl>    <dbl>
## 1 Female all          0.571 3.26e- 8
## 2 Female weekday      0.599 7.17e- 8
## 3 Female weekend      0.503 5.70e- 9
## 4 Male   all          0.656 2.60e-19
## 5 Male   weekday      0.652 2.05e-19
## 6 Male   weekend      0.652 2.01e-19

From the output, the p-value < 0.05 implying that the distribution of the data are significantly different from normal distribution. In other words, we can not assume the normality from the data. Thus we need to move on to a non-parametric metric that would statistically compare the differences between groups.

2.2 Visualisation

Here are the visual evidence.

library(ggpubr)
# Density plot
#ggdensity(log(normality_days_df$all), fill = "lightgray")
ggdensity(normality_days_df$weekday, fill = "lightgray")

ggdensity(normality_days_df$weekend, fill = "lightgray")

# QQ plot
#ggqqplot(log(normality_days_df$all))
ggqqplot(normality_days_df$all)

# Plot weight by group and color by group
library(ggpubr)
ggboxplot(normality_days_df, x = "gender", y = "weekday", 
          color = "gender", palette = c("#00AFBB", "#DC143C"),
          ylab = "Weight", xlab = "Groups")

normality_days_df %>% 
  ggplot(aes(x=log(weekday), fill=gender)) +
  geom_histogram(position = "identity", alpha = 0.8, bins = 30, colour = "grey40") +
  theme_void() +
  labs(fill="")

normality_days_df %>% 
  ggplot(aes(x=log(weekend), fill=gender)) +
  geom_histogram(position = "identity", alpha = 0.8, bins = 30, colour = "grey40") +
  theme_void() +
  labs(fill="")
## Warning: Removed 4 rows containing non-finite values (stat_bin).