Data and R code to reproduce the analysis underlying this Aug. 14, 2025 Inside Climate News article, which scrutinizes the text of the 2024 edition of the U.S. State Department’s Country Reports on Human Rights Practices and compares it with reports from previous years.
We downloaded the individual country pages for all reports from 2016
to 2024, which are in the folder `html`, then parsed their
contents using the script `scraper.R`. The resulting data,
including the full text of each country report, is in the file
`reports.RData` in the `processed_data` folder.
# load required packages
library(tidyverse)
library(tidytext)
library(quanteda)
library(DT)
library(wordcloud)
# restore the objects saved by the scraping step into the workspace
# (the code below expects this to provide the `reports` data frame)
load(file = "processed_data/reports.RData")
This code counts mentions of specific words and the number of words in each country report, for each year.
# mentions of key terms and word counts for each document

# Regular-expression stems to tally, keyed by the suffix used in the
# mentions_* and count_* column names.
key_terms <- c(
  human_rights = "human rights",
  defender = "defender",
  journalis = "journalis",
  democ = "democ",
  lgbt = "lgbt",
  women = "women",
  gender = "gender",
  indigenous = "indigenous",
  refugee = "refugee",
  corrupt = "corrupt",
  torture = "torture",
  oppress = "oppress",
  censor = "censor",
  forest = "forest",
  logging = "logging",
  environmental = "environmental",
  climate_change = "climate change"
)

# One list-column per term holding every case-insensitive match found in
# each report's text.
mentions <- map(key_terms, function(stem) {
  str_extract_all(
    reports$report_text,
    regex(stem, ignore_case = TRUE),
    simplify = FALSE
  )
})
names(mentions) <- str_c("mentions_", names(key_terms))

# Number of matches per report for each term; lengths() measures every
# element of a list-column at once, so no row-by-row pass is needed.
counts <- map(mentions, lengths)
names(counts) <- str_c("count_", names(key_terms))

# word_count is the number of whitespace-delimited tokens in the report.
# The spliced lists add the mentions_* columns followed by the count_* ones.
reports <- reports %>%
  ungroup() %>%
  as_tibble() %>%
  mutate(
    word_count = str_count(report_text, "\\S+"),
    !!!mentions,
    !!!counts
  )
We then summarized the data by year, calculating the rate at which
each term was mentioned per 100,000 words. The summary data is saved in
the file `terms_summary.csv` in the `processed_data` folder. Based on
this summary, we selected specific terms/phrases for further scrutiny.
# Summarize by year: number of reports, total words, total mentions of each
# term, and each term's rate of mentions per 100,000 words.
terms_summary <- reports %>%
  group_by(year) %>%
  summarize(
    reports = n(),
    word_count = sum(word_count),
    # total mentions of every tracked term (all count_* columns)
    across(starts_with("count_"), sum)
  ) %>%
  # Derive a rate_* column from every count_* column so that no term can be
  # left out (the hand-written list had omitted rate_democ).
  mutate(across(
    starts_with("count_"),
    ~ round(.x / word_count * 10^5, 2),
    .names = "{str_replace(.col, 'count_', 'rate_')}"
  ))

# Save the yearly summary; missing values are written as empty cells.
write_csv(terms_summary, "processed_data/terms_summary.csv", na = "")
# Interactive table of how many reports and how many words each year holds.
overview <- terms_summary %>%
  select(
    Year = year,
    `Number of country reports` = reports,
    `Total words` = word_count
  ) %>%
  # show totals with thousands separators
  mutate(`Total words` = prettyNum(`Total words`, big.mark = ","))

datatable(overview)
All values shown are the rate of mentions per 100,000 words.
# Interactive table of yearly rates for the terms chosen for closer scrutiny.
# Names of the vector are the display headers; values are the source columns.
rate_columns <- c(
  Year = "year",
  `Climate change` = "rate_climate_change",
  `LGBT` = "rate_lgbt",
  `Women` = "rate_women",
  `Gender` = "rate_gender",
  `Indigenous` = "rate_indigenous",
  `Corrupt*` = "rate_corrupt"
)

terms_summary %>%
  select(all_of(rate_columns)) %>%
  datatable()
To understand the context in which specific words/phrases of interest are used, we made searchable web tables displaying mentions in 2023 and 2024 together with preceding and subsequent text.
# Keyword-in-context (KWIC) table for "climate change" in the 2023 and 2024
# reports, with a window of 20 tokens on either side of each mention.
term <- "climate change"

# Lower-cased text of every report that mentions the term (literal match).
context <- reports %>%
  mutate(report_text = tolower(report_text)) %>%
  filter(grepl(term, report_text, fixed = TRUE)) %>%
  select(year, country, report_text)

# Run kwic() once per report. Looping over every element of context$country
# would revisit a country once for each year it appears in, repeating
# identical searches and leaving duplicates for unique() to clean up.
term_in_context <- map(c(2023, 2024), function(yr) {
  in_year <- context %>%
    filter(year == yr) %>%
    # keep the first report per country, matching the earlier [1] indexing
    distinct(country, .keep_all = TRUE)
  map2(in_year$country, in_year$report_text, function(ctry, txt) {
    kwic(tokens(txt), pattern = phrase(term), window = 20) %>%
      as_tibble() %>%
      mutate(year = yr, country = ctry)
  }) %>%
    bind_rows()
}) %>%
  bind_rows()

# Searchable table of each mention with its preceding and following text.
term_in_context %>%
  unique() %>%
  select(year, country, pre, pattern, post) %>%
  datatable()
# Keyword-in-context (KWIC) table for "indigenous" in the 2023 and 2024
# reports, with a window of 20 tokens on either side of each mention.
term <- "indigenous"

# Lower-cased text of every report that mentions the term (literal match).
context <- reports %>%
  mutate(report_text = tolower(report_text)) %>%
  filter(grepl(term, report_text, fixed = TRUE)) %>%
  select(year, country, report_text)

# Run kwic() once per report. Looping over every element of context$country
# would revisit a country once for each year it appears in, repeating
# identical searches and leaving duplicates for unique() to clean up.
term_in_context <- map(c(2023, 2024), function(yr) {
  in_year <- context %>%
    filter(year == yr) %>%
    # keep the first report per country, matching the earlier [1] indexing
    distinct(country, .keep_all = TRUE)
  map2(in_year$country, in_year$report_text, function(ctry, txt) {
    kwic(tokens(txt), pattern = phrase(term), window = 20) %>%
      as_tibble() %>%
      mutate(year = yr, country = ctry)
  }) %>%
    bind_rows()
}) %>%
  bind_rows()

# Searchable table of each mention with its preceding and following text.
term_in_context %>%
  unique() %>%
  select(year, country, pre, pattern, post) %>%
  datatable()
The asterisk ensures that words with “corrupt” as their root, such as “corruption,” are also included.
# Keyword-in-context (KWIC) table for words beginning with "corrupt" in the
# 2023 and 2024 reports, with a window of 20 tokens on either side.
# `term` is a glob pattern for kwic(); `term_stem` is the literal text used to
# pre-filter reports. Passing "corrupt*" to grepl() would treat it as a regex
# ("corrup" followed by zero or more "t"), which is not the intended match.
term <- "corrupt*"
term_stem <- "corrupt"

# Lower-cased text of every report that contains the stem.
context <- reports %>%
  mutate(report_text = tolower(report_text)) %>%
  filter(grepl(term_stem, report_text, fixed = TRUE)) %>%
  select(year, country, report_text)

# Run kwic() once per report. Looping over every element of context$country
# would revisit a country once for each year it appears in, repeating
# identical searches and leaving duplicates for unique() to clean up.
term_in_context <- map(c(2023, 2024), function(yr) {
  in_year <- context %>%
    filter(year == yr) %>%
    # keep the first report per country, matching the earlier [1] indexing
    distinct(country, .keep_all = TRUE)
  map2(in_year$country, in_year$report_text, function(ctry, txt) {
    kwic(tokens(txt), pattern = phrase(term), window = 20) %>%
      as_tibble() %>%
      mutate(year = yr, country = ctry)
  }) %>%
    bind_rows()
}) %>%
  bind_rows()

# Searchable table of each mention with its preceding and following text.
term_in_context %>%
  unique() %>%
  select(year, country, pre, pattern, post) %>%
  datatable()